In [1]:
from collections import Counter
from itertools import combinations
from math import sqrt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" #comment out this line if you want to use gpu
import random
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import tensorflow

### Set random seed

In [2]:
import random
random.seed(2021)
np.random.seed(2021)
# tf > 2.0
tensorflow.random.set_seed(2021)
#tf < 2.0
#tf.set_random_seed(2021)

### Root Mean Squared Error (RMSE) is used to evaluate the performance of a recommendation algorithm, so we need to define the following utility function to compute the RMSE given the predicted ratings and the ground truth ratings. 

In [3]:
'''
params:
    -pred: an array containing all predicted ratings
    -actual: an array containing all ground truth ratings
    
return:
    a scalar whose value is the rmse
'''
def rmse(pred, actual):
    # Ignore ratings with value zero.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

# Implement Wide and Deep Learning (WDL) Model

### The wide component is a generalized linear model that takes in the raw input features and the cross-product transformation of categorical features, which enables it to learn the frequent co-occurrence of items or features. 

### The deep component is a Feed-forward Neural Network (FNN) which takes in both continuous and categorical features as input. Specifically,  the normalized values of continuous features are concatenated with the low-dimensional dense embedding vectors converted from categorical features. This concatenated vector is then fed into the FNN during each foward pass. This mechanism tend to increase the diversity of recommendations.

In [4]:
'''
params:
    -len_continuous: number of continuous features
    -deep_vocab_lens: an array of integers where deep_vocab_lens[i] represents the number of unique values of (i+1)-th deep 
        categorical feature
    -len_wide: number of wide features
    -embed_size: dimension of the embedding vectors of deep categorical features
    
return:
    a keras Model object for the constructed wdl model 
'''


def build_wdl_model(len_continuous, deep_vocab_lens, len_wide, embed_size):
    # A list containing all input layers
    input_list = []
    
    # Input layer for continuous features
    continuous_input = Input(shape=(len_continuous,), dtype='float32', name='continuous_input')
    input_list.append(continuous_input)
    
    
    # Get embeddings for all deep categorical features
    emb_list = []
    for vocab_size in deep_vocab_lens:
        _input = Input(shape=(1,), dtype='int32')
        input_list.append(_input)
        _emb = Embedding(output_dim=embed_size, input_dim=vocab_size, input_length=1)(_input)
        _emb = Reshape((embed_size,))(_emb)
        emb_list.append(_emb)
    
    
   
    # Create input layer for deep component by concatenating the embeddings and continuous features' input layer
    deep_input = Concatenate()(emb_list + [continuous_input])
    

    # Construct deep component
    dense_1 = Dense(256, activation='relu')(deep_input)
    dense_1_dp = Dropout(0.3)(dense_1)
    dense_2 = Dense(128, activation='relu')(dense_1_dp)
    dense_2_dp = Dropout(0.3)(dense_2)
    dense_3 = Dense(64, activation='relu')(dense_2_dp)
    dense_3_dp = Dropout(0.3)(dense_3)

    
    # Create input layer for wide component
    wide_input = Input(shape=(len_wide,), dtype='float32')
    input_list.append(wide_input)

    
    # Concatenate the outputs of deep and wide components and feed the 
    # concatenated vector into the finall fully connected layer
    fc_input = Concatenate()([dense_3_dp, wide_input])
    model_output = Dense(1)(fc_input)
    
    model = Model(inputs=input_list,
                  outputs=model_output)
    return model
    

# Utility functions to get the values of different types of features

### Continuous features

In [5]:
'''
params:
    -df: input dataframe
    -continuous_columns: column names of continuous features
    
return: 
    a numpy array where each row contains the values of continuous features in the corresponding row of the
    input dataframe
'''
def get_continuous_features(df, continuous_columns):
    continuous_features = df[continuous_columns].values
    return continuous_features

### Cross product transformation of categorical features

In [6]:
'''
params:
    -df: input dataframe
    -comb_p: number of elements in each combination (e.g., there are two elements in the combination {fried chicken, chicken and 
    waffle}, and three elements in the combination {fried chicken, chicken and waffle, chicken fried rice})
    -topk: number of mostly frequent combinations to retrieve
    -output_freq: whether to return the frequencies of retrieved combinations
    
return:
    1. output_freq = True: a list X where each element is a tuple containing a combinantion tuple and corresponding frequency, and the 
        elements are stored in the descending order of their frequencies
    2. output_freq = False: a list X where each element is a tuple containing a combinantion tuple, and the elements are stored in 
    the descending order of their frequencies
'''
def get_top_k_p_combinations(df, comb_p, topk, output_freq=False):
    # get all combinations with comb_p
    def get_category_combinations(categories_str, comb_p=2):
        categories = categories_str.split(', ')
        return list(combinations(categories, comb_p))
    # [('Lounges', 'Dance Clubs'), ('Lounges', 'Bars'), ('Lounges', 'Nightlife'), ('Dance Clubs', 'Bars'), ('Dance Clubs', 'Nightlife'), ('Bars', 'Nightlife')]
    all_categories_p_combos = df["item_categories"].apply(
        lambda x: get_category_combinations(x, comb_p)).values.tolist()
    # ('Lounges', 'Dance Clubs')
    # list of tuples that each index refer to one combination
    all_categories_p_combos = [tuple(t) for item in all_categories_p_combos for t in item]

    tmp = dict(Counter(all_categories_p_combos))
    sorted_categories_combinations = list(sorted(tmp.items(), key=lambda x: x[1], reverse=True))
    if output_freq:
        return sorted_categories_combinations[:topk]
    else:
        return [t[0] for t in sorted_categories_combinations[:topk]]

### Wide features

In [7]:
'''
params:
    -df: input dataframe
    -selected_categories_to_idx: a dictionary mapping item categories to corrresponding integral indices
    -top_combinations: a list containing retrieved mostly frequent combinantions of item categories
    
return:
    a numpy array where each row contains the categorical features' binary encodings and cross product
    transformations for the corresponding row of the input dataframe
'''

def get_wide_features(df, selected_categories_to_idx, top_combinations):
    def categories_to_binary_output(categories):
        binary_output = [0 for _ in range(len(selected_categories_to_idx))]
        for category in categories.split(', '):
            if category in selected_categories_to_idx:
                binary_output[selected_categories_to_idx[category]] = 1
            else:
                binary_output[0] = 1
        return binary_output
    def categories_cross_transformation(categories):
        current_category_set = set(categories.split(', '))
        corss_transform_output = [0 for _ in range(len(top_combinations))]
        for k, comb_k in enumerate(top_combinations):
            if len(current_category_set & comb_k) == len(comb_k):
                corss_transform_output[k] = 1
            else:
                corss_transform_output[k] = 0
        return corss_transform_output

    category_binary_features = np.array(df.item_categories.apply(
        lambda x: categories_to_binary_output(x)).values.tolist())
    print('category_binary_features shape:',category_binary_features.shape)
    category_corss_transform_features = np.array(df.item_categories.apply(
        lambda x: categories_cross_transformation(x)).values.tolist())
    print('category_cross_features shape:',category_corss_transform_features.shape)
    out = np.concatenate((category_binary_features, category_corss_transform_features), axis=1)
    print('wide features shape:',out.shape)
    return np.concatenate((category_binary_features, category_corss_transform_features), axis=1)


# Rating Prediction

### Load train, validation and test rating tables

In [8]:
tr_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/valid.csv")
te_df = pd.read_csv("data/test.csv")

tr_ratings = tr_df.stars.values
val_ratings = val_df.stars.values



### Load content feautures tables of users and items

In [9]:
user_df = pd.read_csv("data/user.csv")
item_df = pd.read_csv("data/business.csv")

# Rename some columns of dfs and convert the indices of dfs into string type for easier reference in later stage 
user_df = user_df.rename(index=str, columns={t: 'user_' + t for t in user_df.columns if t != 'user_id'})
item_df = item_df.rename(index=str, columns={t: 'item_' + t for t in item_df.columns if t != 'business_id'})


### Associate each row in the rating tables with corresponding user's and item's content features through merging the rating tables and content features tables

In [10]:
# Save the original row indices of each rating table
tr_df["index"] = tr_df.index
val_df["index"]  = val_df.index
te_df["index"] = te_df.index

tr_df = pd.merge(pd.merge(tr_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
val_df = pd.merge(pd.merge(val_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
te_df = pd.merge(pd.merge(te_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)



### Prepare continuous features

In [11]:
# Specify the columns containing conitnuous features
continuous_columns = ["user_average_stars", "user_cool", "user_fans", 
                      "user_review_count", "user_useful", "user_funny",
                      "item_is_open", "item_latitude", "item_longitude", 
                      "item_review_count", "item_stars"]

# Get values of continous features for train/validation/test sets using the utility function defined previously

tr_continuous_features = get_continuous_features(tr_df, continuous_columns)
val_continuous_features = get_continuous_features(val_df, continuous_columns)
te_continuous_features = get_continuous_features(te_df, continuous_columns)

# Standardize each feature by removing the mean of the training samples and scaling to unit variance.
# See https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html for more details.
scaler = StandardScaler().fit(tr_continuous_features)

tr_continuous_features = scaler.transform(tr_continuous_features)
val_continuous_features = scaler.transform(val_continuous_features)
te_continuous_features = scaler.transform(te_continuous_features)

### Prepare deep categorical features

In [12]:
# Sepcify column names of deep categorical features
item_deep_columns = ["item_city", "item_postal_code", "item_state"]

# An array of integers where deep_vocab_lens[i] represents the number of unique values of (i+1)-th deep categorical feature
item_deep_vocab_lens = []

for col_name in item_deep_columns:
    # Get all unique values of this deep categorical feature
    tmp = item_df[col_name].unique()
    
    # Create a dictionary mapping each unique value to a unique integral index
    vocab = dict(zip(tmp, range(1, len(tmp) + 1)))
    
    # Get the number of unique values of this deep categorical features
    item_deep_vocab_lens.append(len(vocab) + 1)
    
    # Create a new column where each entry stores the integral index of this deep categorical feature's value in the same row
    item_df[col_name + "_idx"] = item_df[col_name].apply(lambda x: vocab[x])


# Create a dictionary mapping each business id to corresponding values of deep categorical features
item_deep_idx_columns = [t + "_idx" for t in item_deep_columns]
item_to_deep_categorical_features = dict(zip(item_df.business_id.values, item_df[item_deep_idx_columns].values.tolist()))

# Creat numpy arrays storing corresponding deep categorical features' values of train/validation/test sets using the above mapping
tr_deep_categorical_features = np.array(tr_df.business_id.apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())
val_deep_categorical_features = np.array(val_df.business_id.apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())
te_deep_categorical_features = np.array(te_df.business_id.apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())


### Prepare wide features

##### Prepare binary encoding for each selected category

In [13]:
# Collect the categories of all items 
all_categories = [category for category_list in item_df.item_categories.values for category in category_list.split(", ")]

# Sort all unique values of the item categories by their frequencies in descending order
category_sorted = sorted(Counter(all_categories).items(), key=lambda x: x[1], reverse=True)

# Select top 500 most frequent categories
selected_categories = [t[0] for t in category_sorted[:500]]

# Create a dictionary mapping each secleted category to a unique integral index
selected_categories_to_idx = dict(zip(selected_categories, range(1, len(selected_categories) + 1)))

# Map all categories unseen in the item df to index 0
selected_categories_to_idx['unk'] = 0

# Create a dictionary mapping each integral index to corresponding category
idx_to_selected_categories = {val: key for key, val in selected_categories_to_idx.items()}


##### Prepare cross product transformation for categories

In [14]:
# Get most frequent categories combinantions using the utility function defined previously and store them in the folloing list
top_combinations = []

# Get top 50 most frequent two-categories combinantions in the train set

top_combinations += get_top_k_p_combinations(tr_df, 2, 50, output_freq=False)

# Get top 30 most frequent three-categories combinantions in the train set
top_combinations += get_top_k_p_combinations(tr_df, 3, 30, output_freq=False)

# Get top 20 most frequent four-categories combinantions in the train set
top_combinations += get_top_k_p_combinations(tr_df, 4, 20, output_freq=False)

# Convert each combinantion in the list to a set data structure
top_combinations = [set(t) for t in top_combinations]

In [15]:
# Get values of wide features for train/validation/test sets using the utility function defined previously

tr_wide_features = get_wide_features(tr_df, selected_categories_to_idx, top_combinations)
val_wide_features = get_wide_features(val_df, selected_categories_to_idx, top_combinations)
te_wide_features = get_wide_features(te_df, selected_categories_to_idx, top_combinations)


category_binary_features shape: (100000, 501)
category_cross_features shape: (100000, 100)
wide features shape: (100000, 601)
category_binary_features shape: (10000, 501)
category_cross_features shape: (10000, 100)
wide features shape: (10000, 601)
category_binary_features shape: (10000, 501)
category_cross_features shape: (10000, 100)
wide features shape: (10000, 601)


### Build the input list for each of the train/validation/test sets through aggregating all continuous, deep categorical and wide features


In [16]:
# tr_features = [tr_continuous_features,categorical_features_0,categorical_features_1,categorical_features_2,tr_wide_features]
tr_features = []
tr_features.append(tr_continuous_features.tolist())
tr_features += [tr_deep_categorical_features[:,i].tolist() for i in range(tr_deep_categorical_features.shape[1])]
tr_features.append(tr_wide_features.tolist())



val_features = []
val_features.append(val_continuous_features.tolist())
val_features += [val_deep_categorical_features[:,i].tolist() for i in range(val_deep_categorical_features.shape[1])]
val_features.append(val_wide_features.tolist())

te_features = []
te_features.append(te_continuous_features.tolist())
te_features += [te_deep_categorical_features[:,i].tolist() for i in range(te_deep_categorical_features.shape[1])]
te_features.append(te_wide_features.tolist())

### Build the WDL model defined above

In [17]:
wdl_model = build_wdl_model(
        len(tr_continuous_features[0]),
        item_deep_vocab_lens,   # num of category classes
        len(tr_wide_features[0]), 
        embed_size=100)
#print(len(tr_continuous_features[0]))
#print(item_deep_vocab_lens)
#print(len(tr_wide_features[0]))

### Train the model using Adagrad optimizer and mean squared error loss

In [18]:
wdl_model.compile(optimizer='adagrad', loss='mse')

history = wdl_model.fit(
        tr_features, 
        tr_ratings, 
        epochs=1, verbose=1, callbacks=[ModelCheckpoint('model.h5')])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


### Evaluate the model on train and validation sets using RMSE¶

In [19]:
y_pred = wdl_model.predict(tr_features)
print("TRAIN RMSE: ", rmse(y_pred, tr_ratings))
y_pred = wdl_model.predict(val_features)
print("VALID RMSE: ", rmse(y_pred, val_ratings))

TRAIN RMSE:  1.039801672098216
VALID RMSE:  1.0475014041928454
