In [None]:
# Install the plotting (seaborn) and hyperparameter-search (hyperopt) packages
# into the user site-packages.
# NOTE(review): no versions are pinned here; pin them if reproducibility matters.
!pip install --user -q seaborn
!pip install --user hyperopt

In [None]:
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K
import seaborn as sns
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, Trials
from hyperopt import STATUS_OK
from hyperopt import space_eval
from functools import partial

Dataset loading

In [None]:
# Locations of the training and testing CSV files
train_df_path = './MultipleFeaturesDS/DS_CoA_Training.csv'
test_df_path = './MultipleFeaturesDS/DS_CoA_Testing.csv'

# Read each CSV into a Pandas DataFrame and shuffle its rows: sample(frac=1)
# returns every row in random order. Training is read and shuffled first,
# then testing.
train_df, test_df = (
    pd.read_csv(csv_path).sample(frac=1)
    for csv_path in (train_df_path, test_df_path)
)
train_df.tail()

In [None]:
# Summary statistics of the training frame, transposed so each column becomes a row
train_df.describe().transpose()

In [None]:
# Detach the regression target from both frames. pop() removes the column in
# place, so re-running this cell without reloading the data raises KeyError.
train_labels = train_df.pop('sas_rating')
test_labels = test_df.pop('sas_rating')

# Convert the dataframe into a TensorFlow Dataset
#train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="sas_rating", task=2)

In [None]:
def input_setup(df):
  """Split a feature frame into the two inputs the model is fed with.

  The columns selected by the positional slices 0:7 and 799:804 are gathered
  into one "deck" frame. Every remaining column is converted to a tensor and
  reshaped to (number of rows, 36, 22).

  :param df: pandas DataFrame of features (label column already removed)
  :return: list ``[cards_tensor, deck_dataframe]``
  """
  houses = df.iloc[:, 0:7]
  type_counts = df.iloc[:, 799:804]
  deck_frame = pd.concat([houses, type_counts], axis=1)

  # Passing a DataFrame to drop() removes the columns it is labelled with
  card_frame = df.drop(houses, axis=1).drop(type_counts, axis=1)

  card_tensor = tf.reshape(
      tf.convert_to_tensor(card_frame),
      (len(card_frame), 36, 22))

  return [card_tensor, deck_frame]

# Building the **Keras Model**

Prepare **hyperparameters** using Hyperopt

In [None]:
# Define the HPs
# Hyperopt search space, grouped by the kind of distribution each one uses.
_dense_size_keys = ('dense_a_size', 'dense_b_size', 'dense_one_size', 'dense_two_size')
_dropout_rate_keys = ('dropout_b_rate', 'dropout_one_rate', 'dropout_two_rate')

search_space = {
  # Layer widths: one of 0, 10, ..., 90.
  # NOTE(review): the range starts at 0, so a zero-unit layer is a candidate - confirm this is intended.
  **{key: hp.choice(key, np.arange(0, 100, 10)) for key in _dense_size_keys},
  # Dropout rates: uniform between 0.0 and 0.3
  **{key: hp.uniform(key, 0.0, 0.3) for key in _dropout_rate_keys},
  # Learning rate: log-uniform, i.e. exp(uniform(-10, 0))
  'opt_learning_rate': hp.loguniform('opt_learning_rate', -10, 0),
}

In [None]:
class WeightedSumByDotProduct(keras.layers.Layer):
    """Layer that combines two inputs with a per-sample dot product over axis 1.

    In this notebook it receives the card tensor and one softmax weight per
    card, and therefore outputs a weighted sum of the card rows.
    The layer has no trainable weights.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer keyword arguments (name, dtype, trainable, ...)
        # so the layer can be named and re-created from its config.
        super().__init__(**kwargs)

    def call(self, info, a):
        """Contract axis 1 of ``info`` with axis 1 of ``a`` for every sample.

        :param info: tensor whose axis 1 is summed over (the card tensor here)
        :param a: tensor of weights, with the same length along axis 1
        :return: result of ``K.batch_dot(info, a, axes=1)``
        """
        return K.batch_dot(info, a, axes=1)

In [None]:
def build_and_compile_model(params):
  """Assemble and compile the two-branch regression network.

  Branch A scores each of the 36 card rows with a shared Dense stack, turns
  the scores into softmax weights and uses them to pool the card tensor.
  Branch B is a Dense + Dropout block over the 11 deck features. Both outputs
  are concatenated and passed through two Dense/Dropout stages and a final
  single-unit Dense layer.

  :param params: dict with the keys 'dense_a_size', 'dense_b_size',
      'dense_one_size', 'dense_two_size', 'dropout_b_rate',
      'dropout_one_rate', 'dropout_two_rate' and 'opt_learning_rate'
  :return: compiled keras Model taking ``[card_info, deck_info]``
  """
  # Model inputs
  card_in = layers.Input(shape=(36,22))
  deck_in = layers.Input(shape=(11,))

  # Branch A (card info): the same Dense layers are applied to every card row
  scores = layers.TimeDistributed(
      layers.Dense(params['dense_a_size'], activation='relu'))(card_in)
  scores = layers.TimeDistributed(layers.Dense(1))(scores)
  scores = layers.Flatten()(scores)
  card_weights = layers.Softmax()(scores)
  pooled_cards = WeightedSumByDotProduct()(card_in, card_weights)
  branch_a = keras.models.Model(inputs=card_in, outputs=pooled_cards)

  # Branch B (deck info)
  deck_hidden = layers.Dense(params['dense_b_size'], activation='relu')(deck_in)
  deck_hidden = layers.Dropout(params['dropout_b_rate'])(deck_hidden)
  branch_b = keras.models.Model(inputs=deck_in, outputs=deck_hidden)

  # Join both branches
  head = layers.concatenate([branch_a.output, branch_b.output])

  # Two Dense + Dropout stages followed by the single regression output
  head_stages = (
      ('dense_one_size', 'dropout_one_rate'),
      ('dense_two_size', 'dropout_two_rate'),
  )
  for size_key, rate_key in head_stages:
    head = layers.Dense(params[size_key], activation='relu')(head)
    head = layers.Dropout(params[rate_key])(head)
  prediction = layers.Dense(1)(head)

  model = keras.models.Model(inputs=[branch_a.input, branch_b.input], outputs=prediction)

  # Loss is MAE; MAE and MSE are also reported as metrics
  adam = tf.keras.optimizers.Adam(params['opt_learning_rate'])
  model.compile(loss='mae', optimizer=adam, metrics=['mae','mse'])

  return model

# **HyperParameter** optimization

In [None]:
def train_fcn(params, verbose=0):
  """Build a model from ``params`` and fit it on the training data.

  :param params: hyperparameter dict passed to build_and_compile_model
  :param verbose: Keras verbosity; with 1 the model summary is printed too
  :return: the fitted model
  """
  model = build_and_compile_model(params)
  if verbose == 1:
    model.summary()

  # Stop after 10 epochs without val_loss improvement and restore the best weights
  stop_on_plateau = tf.keras.callbacks.EarlyStopping(
      monitor='val_loss', patience=10, restore_best_weights=True)

  fit_options = dict(
      epochs=200,
      # Calculate validation results on 20% of the training data
      validation_split=0.2,
      callbacks=[stop_on_plateau],
      verbose=verbose)
  model.fit(input_setup(train_df), train_labels, **fit_options)
  return model

In [None]:
def test_fcn(model):
  """Evaluate ``model`` on the test set and return its mean absolute error.

  :param model: compiled model whose evaluate() yields exactly three values
  :return: the second of those values (the 'mae' metric)
  """
  _, mean_abs_error, _ = model.evaluate(input_setup(test_df), test_labels, verbose=0)
  return mean_abs_error

In [None]:
def hyperopt_fcn(params):
  """Hyperopt objective: train with ``params`` and report the resulting MAE.

  The loss handed back to hyperopt is the MAE returned by test_fcn, i.e. it is
  measured on the test set.

  :param params: one sample drawn from the search space
  :return: dict with the keys 'loss' and 'status'
  """
  test_mae = test_fcn(train_fcn(params))
  # Reset the Keras backend state before the next trial builds its own model
  K.clear_session()
  return dict(loss=test_mae, status=STATUS_OK)

**Optimize** the **model**, while keeping track of the best results with a trials file

In [None]:
# Pieces of the path of the pickled hyperopt Trials file. They are joined by plain
# string concatenation (trials_folder + model_name + ".hyperopt"), so with these
# values the file is './Trialskfe_model.hyperopt'.
# NOTE(review): no path separator is inserted - confirm whether './Trials/' was intended.
trials_folder = './Trials'
model_name = 'kfe_model'

In [None]:
def run_trials():
    """Run one batch of hyperopt evaluations and persist the Trials object.

    If a saved Trials file can be loaded, the search resumes from it and runs
    ``trials_step`` further evaluations; otherwise a new search is started with
    a budget of ``max_trials`` evaluations. The Trials object is pickled again
    after ``fmin`` returns.
    """
    trials_step = 1  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 11  # budget of a fresh search: one more than the 10 random start-up jobs given to tpe.suggest below

    # The folder and the model name are concatenated as-is (no separator is added).
    trials_path = trials_folder + model_name + ".hyperopt"

    try:  # try to load an already saved trials object, and increase the max
        # NOTE: unpickling can execute arbitrary code; only load trials files written by this notebook.
        with open(trials_path, "rb") as f:
            trials = pickle.load(f)
        print("Found saved Trials! Loading...")
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))
    except FileNotFoundError:  # nothing saved yet: create a new trials object and start searching
        trials = Trials()
    except Exception as exc:
        # Keep the best-effort fallback to a fresh search, but report why the saved
        # file was ignored. KeyboardInterrupt / SystemExit are no longer swallowed.
        print("Could not load saved Trials ({!r}); starting a new search".format(exc))
        trials = Trials()

    best = fmin(hyperopt_fcn, search_space, algo=partial(tpe.suggest, n_startup_jobs=10), max_evals=max_trials, trials=trials)

    print("Best:", best)

    # save the trials object
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)

In [None]:
# Keep extending the saved search; every call to run_trials() re-saves the Trials file.
# NOTE: this loop has no exit condition - interrupt the kernel to stop it. The cells
# below are not reached by "Run All" until it is interrupted.
while True:
  run_trials()

**Load** best HyperParameters

In [None]:
def unpack_hyperopt_vals(vals):
    """
    Build a copy of a hyperopt 'vals' dictionary with each value unwrapped.

    Every value is replaced by its element at index 0; values that cannot be
    indexed that way (not subscriptable, or empty) are copied unchanged.
    :param vals: dict
    :return: dict
        new dictionary with the same keys and unwrapped values
    """
    assert isinstance(vals, dict), "Parameter must be given as dict."

    def _first_or_self(value):
        # Scalars raise TypeError on indexing, empty sequences raise IndexError
        try:
            return value[0]
        except (TypeError, IndexError):
            return value

    return {key: _first_or_self(value) for key, value in vals.items()}

In [None]:
# Reload the saved Trials object and turn its best trial into concrete hyperparameters.
# The file is opened in a `with` block so the handle is closed after unpickling.
# NOTE: unpickling can execute arbitrary code; only load trials files written by this notebook.
with open(trials_folder + model_name + ".hyperopt", "rb") as f:
  trials = pickle.load(f)
# The best trial's values are wrapped in lists; unwrap them before evaluating the space
best = unpack_hyperopt_vals(trials.best_trial.get('misc').get('vals'))
# space_eval maps the raw values back onto the search space
opt_params = space_eval(search_space, best)
opt_params

# **Train** the final model

In [None]:
# Retrain with the optimized hyperparameters until one run scores a test MAE of
# 3.68 or lower. A model is written to disk only when it beats the best MAE seen
# so far, which starts at 3.80.
min_mae = 3.80
mae = float('inf')  # guarantees that the first training run takes place
while not (mae <= 3.68):
  model = train_fcn(opt_params)
  mae = test_fcn(model)
  print(mae)

  if mae < min_mae:
    model.save("./Model/KF_Eval_model")
    min_mae = mae

In [None]:
# Load the model stored at the path used by the training loop, replacing the in-memory `model`
model = tf.keras.models.load_model("./Model/KF_Eval_model")

Visualize the **loss** and **val_loss** of the model, for testing purposes

In [None]:
# Evaluate the loaded model on the test set; evaluate() returns the loss followed by the two metrics
loss, mae, mse = model.evaluate(input_setup(test_df), test_labels, verbose=0)
print("Mean Abs Error: {:5.2f} SAS points".format(mae))

# **Testing** the model

**Result** visualization

In [None]:
# Predict every test deck; flatten() turns the model output into a 1-D array
test_predictions = model.predict(input_setup(test_df)).flatten()
# Draw 1000 decks for plotting. Features and labels are sampled with the same
# random_state - presumably so that both pick the same rows (verify).
sampled_predictions = model.predict(input_setup(test_df.sample(1000, random_state=420)))
sampled_labels = test_labels.sample(1000, random_state=420)

# Count the predictions whose true rating lies in
# range(int(prediction-5), int(prediction+6))
correct = 0
for truth, prediction in zip(test_labels.to_numpy(), test_predictions):
  if truth in range(int(prediction-5), int(prediction+6)):
    correct+=1
print("Accuracy (-5;+5): {:5.2f}\n".format(correct/len(test_labels)))

# Default figure size and resolution for the plots that follow
plt.rcParams['figure.figsize'] = [16, 12]
plt.rcParams['figure.dpi'] = 100 

# True rating vs. predicted rating for the sampled decks
plt.scatter(sampled_labels, sampled_predictions)
plt.xlabel('True Values [SAS]')
plt.ylabel('Predictions [SAS]')
plt.axis('equal')
plt.axis('square')
# Start both axes at 35 and keep their current upper limits
plt.xlim([35, plt.xlim()[1]])
plt.ylim([35, plt.ylim()[1]])
# Reference diagonal y = x (a perfect prediction lies on this line)
_ = plt.plot([-100, 100], [-100, 100])

**Error** visualization

In [None]:
# Signed prediction error (prediction minus true rating) for every test deck
error = test_predictions - test_labels
# Distribution of the errors over 25 bins
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [SAS]")
_ = plt.ylabel("Count")

**Error** distribution

In [None]:
# Errors of 1000 decks, drawn with the same random_state as sampled_labels
sampled_error = error.sample(1000, random_state=420)

# Error against the true rating, with horizontal guides at +/-5 and +/-10
plt.scatter(sampled_labels, sampled_error)
for guide_level in (10, 5, -5, -10):
  plt.axhline(y=guide_level)
plt.xlabel('True Values [SAS]')
plt.ylabel('Error [SAS]')
plt.axis('equal')
plt.axis('square')

# **Accuracy** stats

**Accuracy** based on range percentages with 10 **categories**

In [None]:
# Split the test set into 10 SAS categories and report, per category, the share of
# decks whose true rating lies inside the (-5;+5) window around the prediction.
res_test_labels = test_labels.reset_index(drop=True)
res_test_df = test_df.reset_index(drop=True)

# Inclusive upper bound of categories 1-9 (<=42, 43-48, ..., 85-90); category 10 is >=91
category_upper_bounds = [42, 48, 54, 60, 66, 72, 78, 84, 90]

# searchsorted(side='left') gives, for each rating, the position of the first bound
# that is >= the rating, i.e. its 0-based category; ratings above 90 get position 9.
# assumes the ratings are whole numbers without missing values - TODO confirm
category_ids = np.searchsorted(category_upper_bounds, res_test_labels.to_numpy(), side='left')

# Select the rows of each category with a boolean mask instead of growing frames
# row by row (DataFrame.append and Series.iteritems no longer exist in pandas 2).
category_dfs = [res_test_df[category_ids == category] for category in range(10)]
category_truths = [res_test_labels[category_ids == category].tolist() for category in range(10)]

print("Deck range (-5;+5) accuracies:")
for i, category_df in enumerate(category_dfs):
  if category_df.empty:
    continue  # no test deck falls into this category
  res = model.predict(input_setup(category_df)).flatten()
  # A deck counts as correct when its true rating is in
  # range(int(prediction-5), int(prediction+6))
  correct = sum(
      1 for truth, predicted in zip(category_truths[i], res)
      if truth in range(int(predicted-5), int(predicted+6)))
  print("Category "+str(i+1)+": "+str(correct/len(category_truths[i])))



**Confusion matrix** using 10 **categories**

In [None]:
def categorize_sas(sas):
  """Map a SAS rating to its category number (1-10).

  Category 1 covers ratings up to 42, categories 2-9 cover the consecutive
  six-wide ranges 43-48, 49-54, ..., 85-90, and category 10 covers 91 and above.
  A value that matches none of these (e.g. 42.5) yields None.

  :param sas: rating to classify
  :return: category number, or None when no category matches
  """
  if sas <= 42:
    return 1

  # Categories 2..9 start at 43, 49, ..., 85 and each spans six ratings
  for category, first_rating in enumerate(range(43, 91, 6), start=2):
    if sas in range(first_rating, first_rating + 6):
      return category

  if sas >= 91:
    return 10
  return None

In [None]:
# Confusion matrix of actual vs. predicted SAS category on the test set
res_test_labels = test_labels.reset_index(drop=True)
res_test_df = test_df.reset_index(drop=True)

# Category of every true rating and of every prediction (truncated to an integer first)
test_category_labels = res_test_labels.apply(categorize_sas).tolist()
test_predictions = model.predict(input_setup(res_test_df)).flatten().tolist()
test_category_predictions = [categorize_sas(int(p)) for p in test_predictions]

data = {
  'y_Actual':    test_category_labels,
  'y_Predicted': test_category_predictions 
}

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

# crosstab only lists the categories that actually occur. Reindex rows and columns
# onto all ten categories so that missing ones show up as zero counts, whichever
# categories happen to be absent.
all_categories = list(range(1, 11))
confusion_matrix = (
    pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    .reindex(index=all_categories, columns=all_categories, fill_value=0)
    .rename_axis(index='Actual', columns='Predicted')
)

plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams['figure.dpi'] = 100 
plt.rc('axes', labelsize=15)    # fontsize of the x and y labels

# Absolute counts
sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='viridis')
plt.show()

# Row-normalized: share of each actual category assigned to each predicted category
confusion_matrix = confusion_matrix.apply(lambda x: x/x.sum(), axis = 1)
sns.heatmap(confusion_matrix, annot=True, fmt='.2g', cmap='viridis')
plt.show()