In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [101]:
import os
import sys
import gc
import glob
import joblib
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import tensorflow_recommenders as tfrs

from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda, Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import LambdaCallback, EarlyStopping, Callback
from tensorflow.keras.utils import plot_model

# 0. Data Load

In [4]:
train = pd.read_json(os.path.join('data', 'user_train_test', 'user_train_v2.json'))

In [6]:
test = pd.read_json(os.path.join('data', 'user_train_test', 'user_test_v2.json'))

In [7]:
train.head()

Unnamed: 0,user_note,rating_per_user,vintage_id,user_like_count,userID,wine_id,wine_name,url,like
0,,4.0,142514980,0,19484511,2532733,Secateurs Red Blendn2016,/badenhorst-family-wines-secateurs-red-blend-s...,1
1,,3.5,48346487,0,19484511,1253802,Kloof Street Swartland Rougen2015,/mullineux-kloof-street-swartland-rouge/w/1253...,0
2,Beautiful golden nectar,3.5,156865950,1,19484511,1123441,Late Harvestn2018,/stables-late-harvest/w/1123441?year=2018,0
3,,3.5,150284640,0,19484511,1157656,Proseccon2017,/ruffino-prosecco/w/1157656?year=2017,0
4,Bright clean and refreshing,3.5,1471106,0,19484511,1134756,Prosecco TrevisonN.V.,/la-gioiosa-prosecco-treviso/w/1134756?year=N.V.,0


In [8]:
test.head()

Unnamed: 0,user_note,rating_per_user,vintage_id,user_like_count,userID,wine_id,wine_name,url,like
0,Soooo good 💕,4.0,164942680,0,19484511,1141133,Prestige Rosé Brut ChampagnenN.V.,/taittinger-prestige-rose-brut-champagne/w/114...,1
1,"Belíssimo champanhe rose, bem seco mais com mu...",4.0,164942680,2,352674,1141133,Prestige Rosé Brut ChampagnenN.V.,/taittinger-prestige-rose-brut-champagne/w/114...,1
2,"Caramel, cherry, woody and oxidized.",1.5,164942680,8,2148498,1141133,Prestige Rosé Brut ChampagnenN.V.,/taittinger-prestige-rose-brut-champagne/w/114...,0
3,Fine example of a great basic NV. Lovely stuff!,4.5,164942680,0,3450270,1141133,Prestige Rosé Brut ChampagnenN.V.,/taittinger-prestige-rose-brut-champagne/w/114...,1
4,,4.0,164942680,0,17786617,1141133,Prestige Rosé Brut ChampagnenN.V.,/taittinger-prestige-rose-brut-champagne/w/114...,1


In [9]:
item = pd.read_csv('./data/Wine_Meta_final_201208.csv')

In [10]:
item.head()

Unnamed: 0,wine_id,name,rating_count,rating_average,label_count,review_count,type_id,body,acidity_x,alcohol,...,wood smoke,wood varnish,yeast,yellow apple,yellow beet,yellow peach,yellow plum,yellow raisin,yerba mate,yogurt
0,1938520,1882 Cabernet Sauvignon,1697,4.1,14879,16,1,5.0,2.0,14.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14604,Les Bessards Hermitage,1078,4.3,5370,3,1,5.0,3.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1930757,Patriarch Estate Grown,1072,4.6,6042,25,1,4.0,3.0,14.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1564280,Merlot,3577,4.3,18748,52,1,4.0,3.0,14.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2576427,Cabernet Sauvignon F Block,115,4.4,806,1,1,5.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
dataset = joblib.load('dataset_item_pca.pkl')
dataset.keys()

dict_keys(['train', 'test'])

In [21]:
prep_train = dataset['train']
prep_test = dataset['test']

del dataset
gc.collect()

22

In [26]:
train.shape, test.shape, prep_train.shape, prep_test.shape

((937756, 9), (6343, 9), (937756, 58), (6343, 58))

# 1. Preprocessing

In [27]:
# 'like' label 붙이기

In [32]:
prep_train['label'] = train['like']
prep_test['label'] = test['like'].values

In [42]:
feature_names = list(prep_train.columns[3:])

In [44]:
for feature_name in feature_names:
    print(feature_name, prep_train[feature_name].isna().sum())

enc_userID 0
enc_wine_id 0
recent 0
like 0
dislike 0
pca_1 47
pca_2 47
pca_3 47
pca_4 47
pca_5 47
pca_6 47
pca_7 47
pca_8 47
pca_9 47
pca_10 47
pca_11 47
pca_12 47
pca_13 47
pca_14 47
pca_15 47
pca_16 47
pca_17 47
pca_18 47
pca_19 47
pca_20 47
pca_21 47
pca_22 47
pca_23 47
pca_24 47
pca_25 47
pca_26 47
pca_27 47
pca_28 47
pca_29 47
pca_30 47
pca_31 47
pca_32 47
pca_33 47
pca_34 47
pca_35 47
pca_36 47
pca_37 47
pca_38 47
pca_39 47
pca_40 47
pca_41 47
pca_42 47
pca_43 47
pca_44 47
pca_45 47
pca_46 47
pca_47 47
pca_48 47
pca_49 47
pca_50 47
label 0


In [53]:
isna_value = prep_train.loc[prep_train['wine_id'] == 1183966].iloc[:, 8: -1].values[0]

In [72]:
pca_columns = [ 'pca_{}'.format(i) for i in range(1, 51)]

In [79]:
isna_index = prep_train.loc[prep_train['wine_id'] == 1886805].index

for idx in isna_index:
    prep_train._set_value(idx, pca_columns, isna_value)

In [80]:
prep_train.loc[prep_train['wine_id'] == 1886805].head()

Unnamed: 0,userID,wine_id,rating,enc_userID,enc_wine_id,recent,like,dislike,pca_1,pca_2,...,pca_42,pca_43,pca_44,pca_45,pca_46,pca_47,pca_48,pca_49,pca_50,label
5168,15409535,1886805,3.5,18,3924,"[3925, 1553, 3926, 3927, 3928, 3929, 3100, 393...","[3254, 2182, 3951, 3958, 3970, 3897, 3995, 400...","[3935, 3948, 3949, 991, 3954, 3961, 3964, 3967...",-111853.789784,32003.295484,...,12.939464,-0.536548,15.232192,3.187089,15.694273,13.106459,-5.07297,4.71226,-3.853703,0
120521,643840,1886805,4.0,466,3924,"[18041, 19996, 2520, 27649, 33812, 34386, 1512...","[18041, 19996, 2520, 15125, 4709, 19271, 28926...","[34386, 2997, 34388, 3922, 14182, 3091, 10438,...",-111853.789784,32003.295484,...,12.939464,-0.536548,15.232192,3.187089,15.694273,13.106459,-5.07297,4.71226,-3.853703,0
140564,13358262,1886805,3.0,545,3924,"[17669, 2548, 8819, 3537, 3537, 6872, 20193, 3...","[2548, 3537, 3537, 20193, 22192, 36916, 19289,...","[3924, 36911, 7917, 36912, 36913, 4917, 44, 18...",-111853.789784,32003.295484,...,12.939464,-0.536548,15.232192,3.187089,15.694273,13.106459,-5.07297,4.71226,-3.853703,0
144555,12936090,1886805,2.5,560,3924,"[6879, 3092, 6552, 12940, 6880, 3846, 3695, 33...","[6880, 20201, 2514, 7928, 37415, 22940, 19443,...","[3924, 6879, 6552, 12940, 26679, 5886, 19697, ...",-111853.789784,32003.295484,...,12.939464,-0.536548,15.232192,3.187089,15.694273,13.106459,-5.07297,4.71226,-3.853703,0
173278,2301477,1886805,3.5,674,3924,"[4089, 4252, 33914, 2805, 3742, 3660, 13173, 3...","[4252, 3742, 3660, 13173, 1605, 17558, 14178, ...","[5412, 2320, 3937, 33387, 7219, 9884, 3363, 10...",-111853.789784,32003.295484,...,12.939464,-0.536548,15.232192,3.187089,15.694273,13.106459,-5.07297,4.71226,-3.853703,0


In [86]:
for feature_name in feature_names:
    print(feature_name, prep_test[feature_name].isna().sum())

enc_userID 0
enc_wine_id 0
recent 0
like 0
dislike 0
pca_1 0
pca_2 0
pca_3 0
pca_4 0
pca_5 0
pca_6 0
pca_7 0
pca_8 0
pca_9 0
pca_10 0
pca_11 0
pca_12 0
pca_13 0
pca_14 0
pca_15 0
pca_16 0
pca_17 0
pca_18 0
pca_19 0
pca_20 0
pca_21 0
pca_22 0
pca_23 0
pca_24 0
pca_25 0
pca_26 0
pca_27 0
pca_28 0
pca_29 0
pca_30 0
pca_31 0
pca_32 0
pca_33 0
pca_34 0
pca_35 0
pca_36 0
pca_37 0
pca_38 0
pca_39 0
pca_40 0
pca_41 0
pca_42 0
pca_43 0
pca_44 0
pca_45 0
pca_46 0
pca_47 0
pca_48 0
pca_49 0
pca_50 0
label 0


In [87]:
prep_train.columns

Index(['userID', 'wine_id', 'rating', 'enc_userID', 'enc_wine_id', 'recent',
       'like', 'dislike', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6',
       'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13',
       'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20',
       'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27',
       'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34',
       'pca_35', 'pca_36', 'pca_37', 'pca_38', 'pca_39', 'pca_40', 'pca_41',
       'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48',
       'pca_49', 'pca_50', 'label'],
      dtype='object')

In [153]:
str_features = ["enc_userID", "enc_wine_id", "recent", 'like', 'dislike', 'label']
int_features = pca_columns

In [186]:
train_str_dict = {
    str_feature: [str(val).encode() for val in prep_train[str_feature].values]
    for str_feature in str_features
}

# train_int_dict = {
#     int_feature: prep_train[int_feature].values
#     for int_feature in int_features
# }

In [187]:
# train_str_dict.update(train_int_dict)
train_str_dict.keys()

dict_keys(['enc_userID', 'enc_wine_id', 'recent', 'like', 'dislike', 'label'])

In [188]:
test_str_dict = {
    str_feature: [str(val).encode() for val in prep_test[str_feature].values]
    for str_feature in str_features
}

test_int_dict = {
    int_feature: prep_test[int_feature].values
    for int_feature in int_features
}

# test_str_dict.update(test_int_dict)
test_str_dict.keys()

dict_keys(['enc_userID', 'enc_wine_id', 'recent', 'like', 'dislike', 'label'])

In [189]:
tensor_train = tf.data.Dataset.from_tensor_slices(train_str_dict)
tensor_test = tf.data.Dataset.from_tensor_slices(test_str_dict)

In [194]:
tensor_train

<TensorSliceDataset shapes: {enc_userID: (), enc_wine_id: (), recent: (), like: (), dislike: (), label: ()}, types: {enc_userID: tf.string, enc_wine_id: tf.string, recent: tf.string, like: tf.string, dislike: tf.string, label: tf.string}>

In [113]:
vocabularies = {}

for feature_name in tqdm(feature_names):
    vocab = tensor_train.batch(1_000_000).map(lambda x: x[feature_name])
    vocabularies[feature_name] = np.unique(np.concatenate(list(vocab)))


  0%|          | 0/56 [00:00<?, ?it/s][A
  2%|▏         | 1/56 [00:51<47:10, 51.47s/it][A
  4%|▎         | 2/56 [01:42<46:07, 51.26s/it][A
  5%|▌         | 3/56 [02:33<45:13, 51.19s/it][A
  7%|▋         | 4/56 [03:22<43:54, 50.66s/it][A
  9%|▉         | 5/56 [04:12<42:47, 50.33s/it][A
 11%|█         | 6/56 [05:02<41:50, 50.21s/it][A
 12%|█▎        | 7/56 [05:50<40:37, 49.74s/it][A
 14%|█▍        | 8/56 [06:44<40:47, 50.99s/it][A
 16%|█▌        | 9/56 [07:37<40:21, 51.53s/it][A
 18%|█▊        | 10/56 [08:29<39:33, 51.59s/it][A
 20%|█▉        | 11/56 [09:19<38:24, 51.21s/it][A
 21%|██▏       | 12/56 [10:11<37:36, 51.29s/it][A
 23%|██▎       | 13/56 [11:00<36:15, 50.59s/it][A
 25%|██▌       | 14/56 [11:50<35:19, 50.47s/it][A
 27%|██▋       | 15/56 [12:40<34:22, 50.31s/it][A
 29%|██▊       | 16/56 [13:27<32:57, 49.44s/it][A
 30%|███       | 17/56 [14:17<32:11, 49.51s/it][A
 32%|███▏      | 18/56 [15:05<31:10, 49.23s/it][A
 34%|███▍      | 19/56 [15:54<30:11, 48.97s/it]

KeyError: in user code:

    <ipython-input-97-d37115fb2f45>:4 None  *
        lambda x: x[feature_name])

    KeyError: 'label'


In [193]:
vocabularies.keys()

dict_keys(['enc_userID', 'enc_wine_id', 'recent', 'like', 'dislike', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34', 'pca_35', 'pca_36', 'pca_37', 'pca_38', 'pca_39', 'pca_40', 'pca_41', 'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48', 'pca_49', 'pca_50', 'label'])

In [160]:
label_dict = {}
label_dict['label'] = np.array([0,1])
label_dict

{'label': array([0, 1])}

In [161]:
vocabularies.update(label_dict)

In [162]:
vocabularies.keys()

dict_keys(['enc_userID', 'enc_wine_id', 'recent', 'like', 'dislike', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34', 'pca_35', 'pca_36', 'pca_37', 'pca_38', 'pca_39', 'pca_40', 'pca_41', 'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48', 'pca_49', 'pca_50', 'label'])

In [207]:
for i in range(1, 51):
    del vocabularies['pca_{}'.format(i)]

In [208]:
vocabularies

{'enc_userID': array([b'0', b'1', b'10', ..., b'997', b'998', b'999'], dtype=object),
 'enc_wine_id': array([b'0', b'1', b'10', ..., b'9997', b'9998', b'9999'], dtype=object),
 'recent': array([b'[0, 10559, 34593, 3101, 34925, 10460, 643, 23106, 49764, 76]',
        b'[0, 11364, 26590, 10510, 18922, 3588, 1041, 21797, 817, 25264]',
        b'[0, 11537, 46496, 1632, 19973, 33741, 20280, 34011, 14916, 41713]',
        ...,
        b'[9999, 9999, 8904, 11183, 6072, 16258, 19029, 242, 22127, 40771]',
        b'[9999]', b'[]'], dtype=object),
 'like': array([b'[0, 7015, 14999, 9101, 46640, 15441, 11071, 14187, 461, 25334]',
        b'[1, 17889, 16332, 13516, 28520, 9728, 12891, 8930, 47784, 1016]',
        b'[1, 25678, 25944, 1513, 6730, 16549, 19269, 19051, 12583, 21715]',
        ..., b'[9999]', b'[99]', b'[]'], dtype=object),
 'dislike': array([b'[0, 1010, 26076, 19312, 14126, 19824, 7872, 15863, 22354, 13118]',
        b'[0, 11307, 10033, 11087, 20593, 6568, 4469, 26118, 13022, 38314]',

In [163]:
cached_train = tensor_train.shuffle(100_000).batch(8192).cache()
cached_test = tensor_test.batch(4096).cache()

In [164]:
def run_models(use_cross_layer, deep_layer_sizes, projection_dim=None, num_runs=5):
    models = []
    rmses = []

    for i in range(num_runs):
        model = DCN(use_cross_layer=use_cross_layer,
                    deep_layer_sizes=deep_layer_sizes,
                    projection_dim=projection_dim)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
        models.append(model)

        model.fit(cached_train, epochs=epochs, verbose=False)
        metrics = model.evaluate(cached_test, return_dict=True)
        rmses.append(metrics["RMSE"])

    mean, stdv = np.average(rmses), np.std(rmses)

    return {"model": models, "mean": mean, "stdv": stdv}

In [165]:
epochs = 5
learning_rate = 0.01

In [216]:
class DCN(tfrs.Model):
    
    def __init__(self, use_cross_layer, deep_layer_sizes, projection_dim=None):
        super().__init__()

        self.embedding_dimension = 32

        str_features = ['enc_userID', 'enc_wine_id', 'recent', 'like', 'dislike']
        str_features = ['enc_userID', 'enc_wine_id']
#         int_features = ['pca_{}'.format(i) for i in range(1, 51)]
        int_features = []

        self._all_features = str_features + int_features
        self._embeddings = {}

        # Compute embeddings for string features.
        for feature_name in str_features:
            vocabulary = vocabularies[feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential(
                [tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                 tf.keras.layers.Embedding(len(vocabulary) + 1,
                                           self.embedding_dimension)
                ])

        # Compute embeddings for int features.
        for feature_name in int_features:
            vocabulary = vocabularies[feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential(
              [tf.keras.layers.experimental.preprocessing.IntegerLookup(
                  vocabulary=vocabulary, mask_value=None),
               tf.keras.layers.Embedding(len(vocabulary) + 1,
                                         self.embedding_dimension)
              ])

        if use_cross_layer:
            self._cross_layer = tfrs.layers.dcn.Cross(
              projection_dim=projection_dim,
              kernel_initializer="glorot_uniform")
        else:
            self._cross_layer = None

        self._deep_layers = [tf.keras.layers.Dense(layer_size, activation="relu")
          for layer_size in deep_layer_sizes]

        self._logit_layer = tf.keras.layers.Dense(1)

        self.task = tfrs.tasks.Ranking(
          loss=tf.keras.losses.MeanSquaredError(),
#             loss = tf.keras.losses.BinaryCrossentropy(),
          metrics = [tf.keras.metrics.RootMeanSquaredError("RMSE")]
#             metrics = ['acc']
        )

    def call(self, features):
        # Concatenate embeddings
        embeddings = []
        for feature_name in self._all_features:
            embedding_fn = self._embeddings[feature_name]
            embeddings.append(embedding_fn(features[feature_name]))

        x = tf.concat(embeddings, axis=1)

        # Build Cross Network
        if self._cross_layer is not None:
            x = self._cross_layer(x)

        # Build Deep Network
        for deep_layer in self._deep_layers:
            x = deep_layer(x)

        return self._logit_layer(x)

    def compute_loss(self, features, training=False):
        labels = features.pop("label")
        scores = self(features)
        return self.task(
            labels=labels,
            predictions=scores,
        )

In [217]:
model = DCN(use_cross_layer=None, deep_layer_sizes=[192, 192], projection_dim=None)

In [218]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))

In [219]:
model.fit(cached_train,  epochs=10, verbose=True)

Epoch 1/10


UnimplementedError:  Cast string to float is not supported
	 [[node ranking_12/mean_squared_error/Cast (defined at /Users/jinseok/opt/anaconda3/lib/python3.7/site-packages/tensorflow_recommenders/tasks/ranking.py:86) ]] [Op:__inference_train_function_26934]

Errors may have originated from an input operation.
Input Source operations connected to node ranking_12/mean_squared_error/Cast:
 IteratorGetNext (defined at <ipython-input-219-5dc611e75305>:1)

Function call stack:
train_function
