In [173]:
import pandas as pd
import numpy as np
from src import configuration as config
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

In [174]:
# load the data
df = config.load_traindata_for_pointwise()
df = df.drop(columns=['cv_score'])
print(df.dtypes)
df.head()

dataset      int64
model       object
tuning      object
scoring     object
encoder     object
rank       float64
dtype: object


Unnamed: 0,dataset,model,tuning,scoring,encoder,rank
0,1169,KNC,model,ACC,BUCV2RGLMME,16.0
1,1169,KNC,model,ACC,BUCV2TE,14.0
2,1169,KNC,model,ACC,CBE,22.0
3,1169,KNC,model,ACC,CE,23.0
4,1169,KNC,model,ACC,CV10RGLMME,7.0


In [175]:
def create_encoder_rankings(df):
    # Group the DataFrame by 'dataset', 'model', 'tuning', and 'scoring' columns
    grouped_df = df.groupby(['dataset', 'model', 'tuning', 'scoring'])
    
    # Create a new DataFrame to store the results
    new_df = pd.DataFrame(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder_rankings'])
    
    for group_keys, group_data in grouped_df:
        dataset, model, tuning, scoring = group_keys
        encoder_rankings = group_data.sort_values('rank', ascending=False)['encoder'].tolist()
        rankings = group_data.sort_values('rank', ascending=False)['rank'].tolist()
        new_row = {'dataset': dataset, 'model': model, 'tuning': tuning, 'scoring': scoring,
                   'encoder_rankings': [encoder_rankings], 'ranking': [rankings]}
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
    
    return new_df

df_listwise = create_encoder_rankings(df)
df_listwise.head()

Unnamed: 0,dataset,model,tuning,scoring,encoder_rankings,ranking
0,3,DTC,full,ACC,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
1,3,DTC,full,AUC,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
2,3,DTC,full,F1,"[[CBE, DE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
3,3,DTC,model,AUC,"[[DE, CBE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18..."
4,3,DTC,model,F1,"[[CBE, DE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18..."


In [176]:
# concat dataset model tuning scoring into one column
df_listwise['dataset_model_tuning_scoring'] = df_listwise['dataset'].astype(str) + ' ' + df_listwise['model'] + ' ' + df_listwise['tuning'] + ' ' + df_listwise['scoring']
# drop everthing but ranking and dataset_model_tuning_scoring
df_listwise = df_listwise.drop(columns=['dataset', 'model', 'tuning', 'scoring'])
df_listwise.head()

Unnamed: 0,encoder_rankings,ranking,dataset_model_tuning_scoring
0,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",3 DTC full ACC
1,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",3 DTC full AUC
2,"[[CBE, DE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",3 DTC full F1
3,"[[DE, CBE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18...",3 DTC model AUC
4,"[[CBE, DE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18...",3 DTC model F1


In [177]:
# convert to a array containing all unique combinations of model, tuning, scoring as byte strings
# unique_factor_combinations = np.unique(df_listwise[['model', 'tuning', 'scoring']])
# unique_factor_combinations = unique_factor_combinations.astype('S')
# print(unique_factor_combinations)

# unique_model_combinations = np.unique(df_listwise['model'])
# unique_model_combinations = unique_factor_combinations.astype('S')

# unique_tuning_combinations = np.unique(df_listwise['tuning'])
# unique_tuning_combinations = unique_factor_combinations.astype('S')

unique_factor_combinations = np.unique(df_listwise[['dataset_model_tuning_scoring']])
unique_factor_combinations = unique_factor_combinations.astype('S')
print(unique_factor_combinations)

unique_encoder_rankings = np.unique(df[['encoder']])
#unique_encoder_rankings = unique_encoder_rankings.astype('S')
print(unique_encoder_rankings)

[b'1037 DTC full AUC' b'1037 DTC full F1' b'1037 DTC model ACC' ...
 b'981 SVC no ACC' b'981 SVC no AUC' b'981 SVC no F1']
['BE' 'BUCV10RGLMME' 'BUCV10TE' 'BUCV2RGLMME' 'BUCV2TE' 'BUCV5RGLMME'
 'BUCV5TE' 'CBE' 'CE' 'CV10RGLMME' 'CV10TE' 'CV2RGLMME' 'CV2TE'
 'CV5RGLMME' 'CV5TE' 'DE' 'DTEM10' 'DTEM2' 'DTEM5' 'ME01E' 'ME10E' 'ME1E'
 'MHE' 'OE' 'OHE' 'PBTE0001' 'PBTE001' 'PBTE01' 'RGLMME' 'SE' 'TE' 'WOEE']


In [178]:
def stack_dict(inputs, fun=tf.stack):
    values = []
    for key in sorted(inputs.keys()):
      values.append(tf.cast(inputs[key], tf.float32))

    return fun(values, axis=-1)

class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32
    print("STARTING INIT")
    # Compute embeddings for factor combinations.
    self.factors_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_factor_combinations),
      tf.keras.layers.Embedding(len(unique_factor_combinations) + 2, embedding_dimension)
    ])
    
    # # Compute embeddings for factor combinations.
    # self.model_embeddings = tf.keras.Sequential([
    #   tf.keras.layers.StringLookup(
    #     vocabulary=unique_model_combinations),
    #   tf.keras.layers.Embedding(len(unique_model_combinations) + 2, embedding_dimension)
    # ])
    
    # # Compute embeddings for factor combinations.
    # self.tuning_embeddings = tf.keras.Sequential([
    #   tf.keras.layers.StringLookup(
    #     vocabulary=unique_tuning_combinations),
    #   tf.keras.layers.Embedding(len(unique_tuning_combinations) + 2, embedding_dimension)
    # ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )
    print("FINISHED INIT")

  def call(self, features):
    # We first convert the id features into embeddings.
    print("We are in call")
    
    factors = ["model", "tuning"]
    print("Factors can be printed {}", factors)
    print(f"The type of features is {type(features)}")
    # #factors = ("model", "tuning", "scoring")
    # #factors = tuple(features[f].ref() for f in factors)  # convert factors list to tuple
    
    factors_embeddings = self.factors_embeddings(features["dataset_model_tuning_scoring"])
    
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    #model_embeddings = self.model_embeddings(features["model"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    #tuning_embeddings = self.tuning_embeddings(features["tuning"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    print(features)
    print(features["model"])
    #list_length = features["model"].shape[1]
    #user_embedding_repeated = tf.repeat(
    #    tf.expand_dims(tuning_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    # concatenated_embeddings = tf.concat(
    #     [tuning_embeddings, model_embeddings], 2)

    # user_emb = tf.expand_dims(model_embeddings, axis=1)
    # movie_emb = tf.expand_dims(tuning_embeddings, axis=1)
    # inputs = [user_emb, movie_emb]
    # concatenated_embeddings = tf.concat(inputs, axis=1)
    # return self.score_model(concatenated_embeddings)
    return self.score_model(factors_embeddings)

  def compute_loss(self, features, training=False):
    print("Computing Loss")
    labels = features.pop("dataset")  
    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [179]:
epochs = 30

# cached_train = train.shuffle(100_000).batch(8192).cache()
# cached_test = test.batch(4096).cache()

In [180]:
train_df = df_listwise
train_df.drop("encoder_rankings", axis=1, inplace=True)
train_df.drop("scoring", axis=1, inplace=True)
train_df.drop("ranking", axis=1, inplace=True)
train_df.head()

KeyError: "['scoring'] not found in axis"

In [None]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

STARTING INIT
FINISHED INIT


In [None]:
numerical_feature_names = ['dataset']
categorical_feature_names = ['model']
list_feature_names = ['encoder_rankings']
df = df.drop(columns=['rank', 'scoring', 'encoder', 'tuning'])

from sklearn.preprocessing import LabelEncoder
# Create a LabelEncoder instance
label_encoder = LabelEncoder()

test_df = df.copy()
# Fit and transform the string column to obtain encoded values
test_df['model'] = label_encoder.fit_transform(df['model'])
#test_df['tuning'] = label_encoder.fit_transform(df['tuning'])

print(test_df.dtypes)

# training_dataset = (
#     tf.data.Dataset.from_tensor_slices(
#         (
#             tf.cast(test_df[categorical_feature_names].values, tf.float32),
#             tf.cast(test_df['dataset'].values, tf.int32)
#         )
#     )
# )

training_dataset = tf.data.Dataset.from_tensor_slices(dict(df_listwise))

dataset    int64
model      int32
dtype: object


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [None]:
for row in training_dataset.take(1):
  print(row)

{'dataset': <tf.Tensor: shape=(), dtype=int64, numpy=1169>, 'model': <tf.Tensor: shape=(), dtype=int32, numpy=1>}


<_TensorSliceDataset element_spec={'model': TensorSpec(shape=(), dtype=tf.string, name=None), 'tuning': TensorSpec(shape=(), dtype=tf.string, name=None), 'scoring': TensorSpec(shape=(), dtype=tf.string, name=None)}>

<CacheDataset element_spec={'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(None, 5), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)}>

In [None]:
listwise_model.fit(training_dataset, epochs=2, verbose=True)

Epoch 1/2
Computing Loss
We are in call
Factors can be printed {} ['model', 'tuning']
The type of features is <class 'dict'>
{'model': <tf.Tensor 'IteratorGetNext:1' shape=() dtype=int32>}
Tensor("IteratorGetNext:1", shape=(), dtype=int32)


ValueError: in user code:

    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\tensorflow_recommenders\models\base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "C:\Users\Marco\AppData\Local\Temp\ipykernel_21944\118192996.py", line 98, in compute_loss
        scores = self(features)
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Marco\AppData\Local\Temp\__autograph_generated_file6w8k9rel.py", line 22, in tf__call
        raise

    ValueError: Exception encountered when calling layer 'ranking_model_12' (type RankingModel).
    
    in user code:
    
        File "C:\Users\Marco\AppData\Local\Temp\ipykernel_21944\118192996.py", line 93, in call  *
            return self.score_model(model_embeddings)
        File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\input_spec.py", line 253, in assert_input_compatibility
            raise ValueError(
    
        ValueError: Exception encountered when calling layer 'sequential_51' (type Sequential).
        
        Input 0 of layer "dense_36" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (32,)
        
        Call arguments received by layer 'sequential_51' (type Sequential):
          • inputs=tf.Tensor(shape=(32,), dtype=float32)
          • training=None
          • mask=None
    
    
    Call arguments received by layer 'ranking_model_12' (type RankingModel):
      • features={'model': 'tf.Tensor(shape=(), dtype=int32)'}
