In [184]:
from collections import Counter
from itertools import combinations
from math import sqrt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" #comment out this line if you want to use gpu
import random
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import tensorflow

### Set random seed

In [185]:
import random
random.seed(2021)
np.random.seed(2021)
# tf > 2.0
tensorflow.random.set_seed(2021)
#tf < 2.0
#tf.set_random_seed(2021)

### Root Mean Squared Error (RMSE) is used to evaluate the performance of a recommendation algorithm, so we need to define the following utility function to compute the RMSE given the predicted ratings and the ground truth ratings. 

In [186]:
'''
params:
    -pred: an array containing all predicted ratings
    -actual: an array containing all ground truth ratings
    
return:
    a scalar whose value is the rmse
'''
def rmse(pred, actual):
    # Ignore ratings with value zero.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

# Implement Wide and Deep Learning (WDL) Model

### The wide component is a generalized linear model that takes in the raw input features and the cross-product transformation of categorical features, which enables it to learn the frequent co-occurrence of items or features. 

### The deep component is a Feed-forward Neural Network (FNN) which takes in both continuous and categorical features as input. Specifically,  the normalized values of continuous features are concatenated with the low-dimensional dense embedding vectors converted from categorical features. This concatenated vector is then fed into the FNN during each foward pass. This mechanism tend to increase the diversity of recommendations.

In [187]:
'''
params:
    -len_continuous: number of continuous features
    -deep_vocab_lens: an array of integers where deep_vocab_lens[i] represents the number of unique values of (i+1)-th deep 
        categorical feature
    -len_wide: number of wide features
    -embed_size: dimension of the embedding vectors of deep categorical features
    
return:
    a keras Model object for the constructed wdl model 
'''


def build_wdl_model(len_continuous, deep_vocab_lens, len_wide, embed_size):
    # A list containing all input layers
    input_list = []
    
    # Input layer for continuous features
    continuous_input = Input(shape=(len_continuous,), dtype='float32', name='continuous_input')
    input_list.append(continuous_input)
    
    
    # Get embeddings for all deep categorical features
    emb_list = []
    for vocab_size in deep_vocab_lens:
        _input = Input(shape=(1,), dtype='int32')
        input_list.append(_input)
        _emb = Embedding(output_dim=embed_size, input_dim=vocab_size, input_length=1)(_input)
        _emb = Reshape((embed_size,))(_emb)
        emb_list.append(_emb)
    
    
   
    # Create input layer for deep component by concatenating the embeddings and continuous features' input layer
    deep_input = Concatenate()(emb_list + [continuous_input])
    

    # Construct deep component
    dense_1 = Dense(256, activation='relu')(deep_input)
    dense_1_dp = Dropout(0.3)(dense_1)
    dense_2 = Dense(128, activation='relu')(dense_1_dp)
    dense_2_dp = Dropout(0.3)(dense_2)
    dense_3 = Dense(64, activation='relu')(dense_2_dp)
    dense_3_dp = Dropout(0.3)(dense_3)

    
    # Create input layer for wide component
    wide_input = Input(shape=(len_wide,), dtype='float32')
    input_list.append(wide_input)

    
    # Concatenate the outputs of deep and wide components and feed the 
    # concatenated vector into the finall fully connected layer
    fc_input = Concatenate()([dense_3_dp, wide_input])
    model_output = Dense(1)(fc_input)
    
    model = Model(inputs=input_list,
                  outputs=model_output)
    return model
    

# Utility functions to get the values of different types of features

### Continuous features

In [188]:
'''
params:
    -df: input dataframe
    -continuous_columns: column names of continuous features
    
return: 
    a numpy array where each row contains the values of continuous features in the corresponding row of the
    input dataframe
'''
def get_continuous_features(df, continuous_columns):
    continuous_features = df[continuous_columns].values
    return continuous_features

### Cross product transformation of categorical features

In [189]:
'''
params:
    -df: input dataframe
    -comb_p: number of elements in each combination (e.g., there are two elements in the combination {fried chicken, chicken and 
    waffle}, and three elements in the combination {fried chicken, chicken and waffle, chicken fried rice})
    -topk: number of mostly frequent combinations to retrieve
    -output_freq: whether to return the frequencies of retrieved combinations
    
return:
    1. output_freq = True: a list X where each element is a tuple containing a combinantion tuple and corresponding frequency, and the 
        elements are stored in the descending order of their frequencies
    2. output_freq = False: a list X where each element is a tuple containing a combinantion tuple, and the elements are stored in 
    the descending order of their frequencies
'''
def get_top_k_p_combinations(df, comb_p, topk, output_freq=False):
    # get all combinations with comb_p
    def get_category_combinations(categories_str, comb_p=2):
        categories = categories_str.split(', ')
        return list(combinations(categories, comb_p))
    # [('Lounges', 'Dance Clubs'), ('Lounges', 'Bars'), ('Lounges', 'Nightlife'), ('Dance Clubs', 'Bars'), ('Dance Clubs', 'Nightlife'), ('Bars', 'Nightlife')]
    all_categories_p_combos = df["item_categories"].apply(
        lambda x: get_category_combinations(x, comb_p)).values.tolist()
    # ('Lounges', 'Dance Clubs')
    # list of tuples that each index refer to one combination
    all_categories_p_combos = [tuple(t) for item in all_categories_p_combos for t in item]

    tmp = dict(Counter(all_categories_p_combos))
    sorted_categories_combinations = list(sorted(tmp.items(), key=lambda x: x[1], reverse=True))
    if output_freq:
        return sorted_categories_combinations[:topk]
    else:
        return [t[0] for t in sorted_categories_combinations[:topk]]

### Wide features

In [190]:
'''
params:
    -df: input dataframe
    -selected_categories_to_idx: a dictionary mapping item categories to corrresponding integral indices
    -top_combinations: a list containing retrieved mostly frequent combinantions of item categories
    
return:
    a numpy array where each row contains the categorical features' binary encodings and cross product
    transformations for the corresponding row of the input dataframe
'''

def get_wide_features(df, selected_categories_to_idx, top_combinations):
    def categories_to_binary_output(categories):
        binary_output = [0 for _ in range(len(selected_categories_to_idx))]
        for category in categories.split(', '):
            if category in selected_categories_to_idx:
                binary_output[selected_categories_to_idx[category]] = 1
            else:
                binary_output[0] = 1
        return binary_output
    def categories_cross_transformation(categories):
        current_category_set = set(categories.split(', '))
        corss_transform_output = [0 for _ in range(len(top_combinations))]
        for k, comb_k in enumerate(top_combinations):
            if len(current_category_set & comb_k) == len(comb_k):
                corss_transform_output[k] = 1
            else:
                corss_transform_output[k] = 0
        return corss_transform_output

    category_binary_features = np.array(df.item_categories.apply(
        lambda x: categories_to_binary_output(x)).values.tolist())
    print('category_binary_features shape:',category_binary_features.shape)
    category_corss_transform_features = np.array(df.item_categories.apply(
        lambda x: categories_cross_transformation(x)).values.tolist())
    print('category_cross_features shape:',category_corss_transform_features.shape)
    out = np.concatenate((category_binary_features, category_corss_transform_features), axis=1)
    print('wide features shape:',out.shape)
    return np.concatenate((category_binary_features, category_corss_transform_features), axis=1)


# Rating Prediction

### Load train, validation and test rating tables

In [191]:
tr_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/valid.csv")
te_df = pd.read_csv("data/test.csv")

tr_ratings = tr_df.stars.values
val_ratings = val_df.stars.values

### Load content feautures tables of users and items

In [192]:
user_df = pd.read_csv("data/user.csv")
item_df = pd.read_csv("data/business.csv")

# Rename some columns of dfs and convert the indices of dfs into string type for easier reference in later stage 
user_df = user_df.rename(index=str, columns={t: 'user_' + t for t in user_df.columns if t != 'user_id'})
item_df = item_df.rename(index=str, columns={t: 'item_' + t for t in item_df.columns if t != 'business_id'})


### Associate each row in the rating tables with corresponding user's and item's content features through merging the rating tables and content features tables

In [193]:
# Save the original row indices of each rating table
tr_df["index"] = tr_df.index
val_df["index"]  = val_df.index
te_df["index"] = te_df.index

tr_df = pd.merge(pd.merge(tr_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
val_df = pd.merge(pd.merge(val_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)
te_df = pd.merge(pd.merge(te_df, user_df, on='user_id'), item_df, on='business_id').sort_values(by=['index']).reset_index(drop=True)

In [194]:
tr_df

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours
0,11d83531dcee4af6f5701696ff45a8d8,54eb8c224740ce5a8a1db56324cbf0c0,3.0,0,1666,Jen,75,2007-07-19 00:56:23,150,83,56,,2,3.25,0,0,0,0,0,4,5,2,2,0,1,5188,Four Kegs Sports Bar,"276 N Jones Blvd, Ste B",Las Vegas,NV,89107,36.175904,-115.223262,4.0,441,1,"{'OutdoorSeating': 'False', 'GoodForMeal': ""{'...","American (New), American (Traditional), Sports...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
1,f3270a0bfa9f67f4b00fd9dadbf1d3e4,e398e51ecca29473c80b058ab17e903e,5.0,1,298,Matthew,430,2014-05-27 06:33:22,1713,466,1155,201620172018,73,4.58,26,3,1,0,0,11,47,67,67,21,30,4590,Thai Spices,"66 S Dobson Rd, Ste 133",Mesa,AZ,85202,33.412708,-111.875803,4.0,343,1,"{'DogsAllowed': 'False', 'Alcohol': ""u'full_ba...","Thai, Restaurants","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3..."
2,8515478d756cecae1d46ad274e583fba,62d5a2614594cdfe014cb3f47bc1f183,4.0,2,847,Danielle,66,2013-09-27 21:02:56,186,50,98,2015,16,4.03,12,1,0,0,0,7,15,6,6,9,1,3033,Lakeside,3131 Las Vegas Blvd S,Las Vegas,NV,89109,36.126576,-115.166935,3.5,450,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Restaurants, Steakhouses, American (New), Seafood","{'Monday': '17:30-22:0', 'Tuesday': '17:30-22:..."
3,f7a8d4e39387479bee5786d9af7a384e,5e4ad90afb13df5004803bd165a905a9,5.0,3,1153,Lana,247,2013-07-20 22:03:06,363,145,223,2015201620172018,9,3.88,2,0,1,1,0,4,10,10,10,6,1,5268,Saku Sushi,478 Queen Street W,Toronto,ON,M5V 2B2,43.648086,-79.400362,4.0,401,1,"{'HasTV': 'False', 'RestaurantsReservations': ...","Breakfast & Brunch, Sushi Bars, Japanese, Rest...","{'Monday': '12:0-22:30', 'Tuesday': '12:0-23:3..."
4,edbc6b267c51aaa5f89ec35237126649,cff931388a0021c662fa8d3437448a89,2.0,4,368,G,113,2012-04-11 07:50:10,53,11,17,,0,3.68,0,0,0,0,0,2,0,0,0,0,1,4636,Dave & Buster's,"2130 Park Centre Dr, Ste 100",Las Vegas,NV,89135,36.148748,-115.332187,3.0,473,1,"{'Caters': 'False', 'RestaurantsTakeOut': 'Tru...","Restaurants, Arts & Entertainment, Sports Bars...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60075,34dea4ede612fd05bd8431c597204b3d,1f3f068cd8717298ed51314cc0d47a77,3.0,60075,704,Tania,128,2007-02-22 17:11:02,118,32,67,,6,3.77,1,5,2,0,0,2,2,8,8,3,0,5704,Artisan Hotel Boutique,1501 W Sahara Ave,Las Vegas,NV,89102,36.143757,-115.169723,3.0,566,1,"{'BusinessAcceptsCreditCards': 'True', 'Music'...","Hotels & Travel, Event Planning & Services, Ho...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
60076,aa24183e54f6657ab3545557271bdc2a,670b348d2a411ec96a1e45c4764192dc,2.0,60076,292,Ennie,434,2009-08-27 18:32:54,330,127,160,2011,22,4.09,11,6,1,3,2,5,24,11,11,10,6,666,Momofuku Noodle Bar,"190 University Avenue, Ground Floor",Toronto,ON,M5H 0A3,43.649678,-79.386275,3.0,897,1,"{'NoiseLevel': ""u'loud'"", 'Caters': 'False', '...","Ramen, Soup, Korean, Japanese, Asian Fusion, A...","{'Monday': '17:0-22:30', 'Tuesday': '17:0-23:0..."
60077,b695bc3cf274961ccba9ab4d2bfaddb8,9e4b3792e6281bc37abfbc4c9eeb05eb,4.0,60077,452,A. Marie,221,2016-01-15 05:01:36,221,69,77,,17,3.70,3,1,0,2,0,10,41,4,4,1,5,2531,Fairmont Royal York,100 Front Street W,Toronto,ON,M5J 1E3,43.645694,-79.381884,3.5,207,1,"{'RestaurantsPriceRange2': '3', 'WiFi': ""u'pai...","Event Planning & Services, Hotels & Travel, Ho...",
60078,d6b815c6e6775df6bb61ea513e78fd52,c14d09d594c535aa5bd06ca537b9b19e,2.0,60078,1104,Calvin,1146,2011-01-26 01:17:19,1226,391,449,2012201320142015201620172018,43,3.33,15,4,2,0,1,38,32,27,27,20,5,4330,Cafe Hollywood,"Hollywood Square, 7240 Kennedy Road",Markham,ON,L3R 7P2,43.832656,-79.306583,3.0,206,1,"{'DogsAllowed': 'False', 'NoiseLevel': ""u'aver...","Fast Food, Restaurants, Chinese","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ..."


In [195]:
val_df

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours
0,8becc257dd950c530711e297be327e7b,5ed2bef8c901ebdcc96fedf2f4560c79,5.0,0,282,Niki,113,2014-02-23 19:01:44,135,21,37,201620172018,11,4.03,1,0,1,0,0,1,3,0,0,0,0,3183,Zest - Bistro & Bar,"10670 Southern Highlands Pkwy, Ste 102",Las Vegas,NV,89141,35.995151,-115.206840,4.5,566,1,"{'GoodForMeal': ""{'dessert': False, 'latenight...","Restaurants, Lounges, American (New), Italian,...","{'Monday': '16:0-22:0', 'Tuesday': '16:0-22:0'..."
1,6dc9688262188b67fcdaaf30ba5f2b41,5bfc74cc8eaa89e28ae9bbc071770fc4,5.0,1,1050,Bridgette,37,2014-07-12 13:16:07,46,13,15,,1,3.92,0,0,0,0,0,1,1,1,1,1,0,3884,The Watershed,5350 S Lakeshore Dr,Tempe,AZ,85283,33.375991,-111.919919,3.0,517,1,"{'RestaurantsPriceRange2': '2', 'BestNights': ...","Music Venues, Sports Bars, American (Tradition...","{'Monday': '17:0-2:0', 'Tuesday': '16:0-22:0',..."
2,97b4eceee3d20c884508469bf093e5c3,c0ab608051317680920ea102073b92e1,5.0,2,552,Ann,110,2013-01-16 20:26:12,281,189,253,,19,3.92,8,0,0,1,0,3,10,17,17,5,8,2874,Brew Tea Bar,"7380 S Rainbow Blvd, Ste 101",Las Vegas,NV,89139,36.054227,-115.242407,5.0,1506,1,"{'Alcohol': ""'none'"", 'BusinessAcceptsCreditCa...","Tea Rooms, Desserts, Cafes, Restaurants, Food,...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ..."
3,440847a87174f0371f82c9aca88a2c06,9dbd4b0c872e768640b62bf77c18ba59,2.0,3,955,Priya,117,2011-09-23 04:48:01,136,48,39,20132014201520162018,14,3.61,3,1,0,0,0,4,8,11,11,1,4,2633,The Dirty Bird Chicken + Waffles,79 Kensington Avenue,Toronto,ON,M5T 2K2,43.654647,-79.400561,3.5,284,1,"{'RestaurantsGoodForGroups': 'False', 'Busines...","Gluten-Free, Chicken Shop, Burgers, Fast Food,...","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'..."
4,d702349096c4afd3791720fdcac583cb,ac74f2fdf664962aaf45f55268e87b05,5.0,4,1245,Sarah,85,2013-05-07 14:34:21,220,31,130,2014201520172018,18,3.54,11,1,0,0,0,6,10,18,18,11,3,5414,Square Cafe,1137 S Braddock Ave,Pittsburgh,PA,15218,40.432114,-79.893269,4.0,463,1,"{'WiFi': ""u'free'"", 'RestaurantsGoodForGroups'...","Breakfast & Brunch, Vegetarian, Restaurants, F...","{'Monday': '0:0-0:0', 'Tuesday': '7:0-15:0', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7505,7921e00f76afd4b96fd173a72a2aff12,83f42ea866998d0c04ea721c1bd94845,4.0,7505,2439,Sandy And Jayce,64,2015-11-04 22:19:08,10,0,3,,0,4.48,0,0,0,0,0,0,1,1,1,0,0,4641,Scottsdale Fashion Square,7014 E Camelback Rd,Scottsdale,AZ,85251,33.503582,-111.929341,4.0,493,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Outlet Stores, Shopping Centers, Shopping","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'..."
7506,a46b6bd5649c8c0b82d27946683180ed,8c6424a3fca449b9cd35fa64cab7ca23,4.0,7506,815,MaryJane,76,2010-12-21 04:32:23,129,29,50,2015201620172018,9,3.85,6,1,0,0,0,4,0,3,3,6,2,3233,Les Deux Gamins,170 Rue Prince Arthur Est,Montréal,QC,H2X 1B7,45.515472,-73.570962,4.0,483,1,"{'RestaurantsTakeOut': 'False', 'WiFi': ""u'fre...","Restaurants, Bars, Cafes, Wine Bars, French, N...","{'Monday': '17:30-22:0', 'Tuesday': '17:30-22:..."
7507,e57b98041557760ee2dac27c64a6c796,113d571c7611c1ef355e64adc95cb91c,5.0,7507,1925,Kat,93,2011-03-18 16:36:31,250,56,36,,1,3.30,0,0,0,0,0,2,3,0,0,3,0,2820,Helio Basin Brewing,3935 E Thomas Rd,Phoenix,AZ,85018,33.479805,-111.996228,4.5,272,1,"{'Caters': 'False', 'GoodForKids': 'False', 'B...","American (New), Breweries, Restaurants, Food, ...","{'Tuesday': '15:0-23:0', 'Wednesday': '15:0-23..."
7508,434c38d8cfe7ef0f355899dde8338ee1,a2931c1ec86cdba598e9e97e92326a83,4.0,7508,1316,Colin,199,2011-05-07 19:28:24,331,67,233,2015201620172018,8,4.27,2,1,0,0,0,3,6,3,3,2,1,4396,Viva Las Arepas,"1616 S Las Vegas Blvd, Ste 120",Las Vegas,NV,89104,36.151897,-115.152534,4.5,1395,1,"{'RestaurantsAttire': ""u'casual'"", 'Restaurant...","Latin American, Venezuelan, Food Stands, Resta...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


In [196]:
te_df

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours
0,39b627ea7d06c70fd36d5bfc6c23ddbb,d599f2a9f7e782f0a3fc16fd5dc22027,0.0,0,1253,Mel,47,2013-07-31 21:07:39,40,7,21,20172018,4,4.45,1,0,0,2,0,0,1,2,2,1,1,1816,Mac Shack,"8975 W Charleston Blvd, Ste 140",Las Vegas,NV,89117,36.157660,-115.290717,3.5,262,0,"{'Alcohol': ""u'beer_and_wine'"", 'HasTV': 'Fals...","Restaurants, Italian, Vegan, Gluten-Free","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
1,2fd27ce10e62ba2578d0c04799027a6e,69423d084bc0b22ea07aae598acc8c52,0.0,1,1933,Netta,34,2011-09-22 23:40:05,46,5,12,,0,3.51,0,0,0,0,0,1,0,0,0,0,0,4658,Kabuki Japanese Restaurant,6770 N Sunrise Blvd,Glendale,AZ,85305,33.533954,-112.261047,4.0,544,1,"{'RestaurantsPriceRange2': '2', 'RestaurantsDe...","Japanese, Restaurants, Sushi Bars, Asian Fusion","{'Monday': '11:0-21:30', 'Tuesday': '11:0-22:3..."
2,d7995d511b5493a4b6ec34af6290e080,9ef9cfddc1e6fab536d4517e204064be,0.0,2,106,Jeffrey,262,2007-12-03 18:57:27,451,296,273,200920102011,8,3.74,14,3,1,4,1,21,15,17,17,11,1,3148,Bouchon Bakery at the Venetian Theater,"3355 Las Vegas Blvd S, Venetian Theater, Sport...",Las Vegas,NV,89109,36.122910,-115.170716,4.0,1152,1,"{'RestaurantsGoodForGroups': 'True', 'Business...","Desserts, Food, French, Sandwiches, Bakeries, ...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."
3,36d8bb0193e3c785a2c47dba1abab521,4380c5fcdedd19169190eaae7de83986,0.0,3,981,Lynda,36,2013-02-03 02:55:18,50,21,23,,0,4.32,0,1,0,0,0,0,1,0,0,0,1,4972,Sassy Lashes - Henderson,"2560 St Rose Pkwy, Ste 100",Henderson,NV,89074,36.016537,-115.101612,4.5,217,1,"{'BusinessAcceptsBitcoin': 'False', 'BusinessP...","Beauty & Spas, Cosmetics & Beauty Supply, Shop...","{'Monday': '8:0-0:0', 'Tuesday': '8:0-0:0', 'W..."
4,b4a3aea52dea7ad3fc324a8c8d4356fc,88cd731bf93dd4706e824d0663018808,0.0,4,658,Liz,135,2009-07-08 02:17:34,230,75,87,,14,3.43,8,7,2,0,0,14,7,4,4,5,2,754,Angelina's Pho & Grill Bar,"5350 W Bell Rd, Ste 123-124",Glendale,AZ,85308,33.640039,-112.174073,4.0,393,1,"{'Caters': 'True', 'OutdoorSeating': 'True', '...","Asian Fusion, Vietnamese, Seafood, Sushi Bars,...","{'Tuesday': '16:15-21:0', 'Wednesday': '16:15-..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7505,42326317a1c3456c7ade772762f41d19,e4cde8be05f410c5f29dfe0cb76b21b5,0.0,7505,128,Missy,205,2014-09-16 04:25:04,2467,1182,1980,201620172018,68,3.63,85,7,14,2,1,23,83,200,200,70,59,3062,Scramble,"6590 N Scottsdale Rd, Ste 100",Scottsdale,AZ,85253,33.533520,-111.926206,4.0,439,1,"{'RestaurantsReservations': 'False', 'Restaura...","American (Traditional), Restaurants, Vegan, Br...","{'Monday': '6:0-14:0', 'Tuesday': '6:0-14:0', ..."
7506,a557347314e8002c6ae452071e6b7e18,a074be3d6a7f48fd17d37aa7acfe10f1,0.0,7506,2169,Wej,39,2015-06-17 19:39:11,18,8,6,,1,3.60,0,0,0,0,0,0,0,0,0,0,0,267,SUSHISAMBA - Las Vegas,3327 Las Vegas Blvd S,Las Vegas,NV,89109,36.124582,-115.167553,4.0,2355,1,"{'GoodForDancing': 'False', 'BusinessParking':...","Bars, Dim Sum, Japanese, Asian Fusion, Restaur...","{'Monday': '11:30-1:0', 'Tuesday': '11:30-1:0'..."
7507,59c52c7a858503d1bb970923e08d1b0c,65399b606015d6148f10f927dd4ae0ff,0.0,7507,1441,Rose,73,2014-05-24 18:17:35,113,40,39,2017,2,3.89,0,0,0,0,0,4,7,1,1,0,0,2387,Archi's Thai Bistro,"6345 S Rainbow Blvd, Ste 100",Las Vegas,NV,89118,36.073206,-115.243538,4.0,770,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Ethnic Food, Thai, Food, Specialt...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
7508,f0fa7ac5cd64443f3cfe4b2042e4a830,94bb8df33b17a8e7678ec8d646950148,0.0,7508,279,Yas,97,2013-01-20 16:53:23,31,6,13,20142018,3,3.90,2,0,0,0,0,0,3,0,0,0,1,2975,Alihan's Mediterranean Cuisine,124 6th St,Pittsburgh,PA,15222,40.443260,-80.002570,4.5,187,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Breakfast & Brunch, Restaurants, Turkish, Medi...","{'Monday': '14:0-21:0', 'Tuesday': '11:0-22:0'..."


In [197]:
import pandas as pd
import ast
import math

def isnan(value):
  try:
      return math.isnan(float(value))
  except:
      return False
  
def convert_value(value):
  if value == "None":
    return None
  if isinstance(value, bool):
    return 1 if value else 0
  if value.isdecimal():
    return float(value)
  if value == "yes":
    return 1
  if value == "no":
    return 0
  return value

def expand_json_column(df, attributes): 
  for attribute in attributes:
    list_dict = []
    for i,dict_str in enumerate(df[attribute]):

      # check for nan
      if not isnan(dict_str):
        dict_str = dict_str.replace('"','').replace("'False'","False").replace("'True'","True")
        temp_dict = ast.literal_eval(dict_str)

        temptemp_dict = {}
        for k, v in temp_dict.items():
          # print(k, v)
          if isinstance(v, dict):
            for k1, v1 in v.items():
              value = convert_value(v1)
              if value:
                temptemp_dict[f"{k}_{k1}"] = value
          else:
            value = convert_value(v)
            if value:
              temptemp_dict[k] = value
        
        list_dict.append(temptemp_dict)

    new_df = pd.DataFrame(list_dict)
    df = df.join(new_df, lsuffix='_caller', rsuffix='_other')

  return df

def expand_list_column(df, attributes):
  for attribute in attributes:
    list_dict = []
    for i,list_str in enumerate(df[attribute]):

       # check for nan
      if not isnan(list_str):

        temp_dict = {}
        
        for key in list_str.split(','):
          temp_dict[key] = 1
      list_dict.append(temp_dict)

    new_df = pd.DataFrame(list_dict)
    df = df.join(new_df, lsuffix='_caller', rsuffix='_other')
  return df


In [198]:
tr_df = expand_json_column(tr_df,["item_attributes"])
val_df = expand_json_column(val_df,["item_attributes"])
te_df = expand_json_column(te_df,["item_attributes"])

In [199]:
# add columns and reorder the columns in val and test
new_columns = ['HairSpecializesIn_kids', 'HairSpecializesIn_straightperms', 'HairSpecializesIn_perms', 'HairSpecializesIn_asian']
new_data = np.zeros((int(len(tr_df)), int(len(new_columns))))
new_val = pd.DataFrame(new_data, columns = new_columns)
val_df = pd.concat((val_df, new_val), axis=1)
val_df = val_df[tr_df.columns]
te_df = te_df[tr_df.columns]

In [16]:
#tr_df.to_csv('temp.csv',index=False)

In [116]:
#tr_df.iloc[:,39:]

In [201]:
### remove the columns with too low frequency
for column in tr_df.iloc[:,97:]:
    tr_df = tr_df.drop([column], axis=1)
    
for column in tr_df.iloc[:,39:97]:
    print(tr_df[column].unique())
    if len(tr_df[column].unique()) < 3:
        tr_df[column] = tr_df[column].fillna(0)
    else:
        tr_df[column] = tr_df[column].fillna('Nan')
    print(tr_df[column].value_counts())

[ 1. nan]
0.0    54516
1.0     5564
Name: GoodForMeal_latenight, dtype: int64
[ 1. nan]
1.0    34895
0.0    25185
Name: GoodForMeal_lunch, dtype: int64
[ 1. nan]
1.0    39100
0.0    20980
Name: GoodForMeal_dinner, dtype: int64
[ 1. nan]
1.0    50199
0.0     9881
Name: RestaurantsGoodForGroups, dtype: int64
['yes_corkage' nan 'yes_free']
Nan            56073
yes_free        2936
yes_corkage     1071
Name: BYOBCorkage, dtype: int64
[ 1. nan]
1.0    52503
0.0     7577
Name: BusinessAcceptsCreditCards, dtype: int64
[ 1. nan]
1.0    40769
0.0    19311
Name: GoodForKids, dtype: int64
[ 1. nan]
0.0    58974
1.0     1106
Name: Corkage, dtype: int64
[ 1. nan]
1.0    47742
0.0    12338
Name: RestaurantsTakeOut, dtype: int64
[ 1. nan]
1.0    33037
0.0    27043
Name: BusinessParking_lot, dtype: int64
[ 1. nan]
1.0    32001
0.0    28079
Name: HasTV, dtype: int64
[ 1.  2.  4.  3. nan]
2.0    39790
1.0    10735
3.0     5912
Nan     2542
4.0     1101
Name: RestaurantsPriceRange2, dtype: int64
['casual

In [202]:
for column in tr_df.iloc[:,39:97]:
    if len(tr_df[column].unique()) > 2:
        tr_df = pd.concat([tr_df, pd.get_dummies(tr_df[column], prefix=column)], axis=1)
        tr_df = tr_df.drop([column],axis=1)

In [203]:
tr_df

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours,GoodForMeal_latenight,GoodForMeal_lunch,GoodForMeal_dinner,RestaurantsGoodForGroups,BusinessAcceptsCreditCards,GoodForKids,Corkage,RestaurantsTakeOut,BusinessParking_lot,HasTV,BestNights_friday,BestNights_wednesday,BestNights_saturday,BikeParking,RestaurantsTableService,Ambience_divey,RestaurantsReservations,RestaurantsDelivery,Caters,Ambience_casual,WheelchairAccessible,BusinessParking_garage,BusinessParking_valet,Ambience_romantic,OutdoorSeating,Ambience_trendy,BusinessParking_street,HappyHour,GoodForMeal_dessert,GoodForMeal_brunch,GoodForMeal_breakfast,Ambience_hipster,Open24Hours,DogsAllowed,BestNights_tuesday,Music_background_music,BestNights_sunday,BestNights_thursday,DriveThru,Ambience_classy,Music_live,Music_dj,BusinessParking_validated,Ambience_upscale,BestNights_monday,ByAppointmentOnly,CoatCheck,GoodForDancing,Ambience_intimate,Ambience_touristy,BYOBCorkage_Nan,BYOBCorkage_yes_corkage,BYOBCorkage_yes_free,RestaurantsPriceRange2_1.0,RestaurantsPriceRange2_2.0,RestaurantsPriceRange2_3.0,RestaurantsPriceRange2_4.0,RestaurantsPriceRange2_Nan,RestaurantsAttire_Nan,RestaurantsAttire_casual,RestaurantsAttire_dressy,Alcohol_Nan,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,NoiseLevel_Nan,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,WiFi_Nan,WiFi_free,WiFi_paid,Smoking_1,Smoking_Nan,Smoking_outdoor,AgesAllowed_21plus,AgesAllowed_Nan,AgesAllowed_allages
0,11d83531dcee4af6f5701696ff45a8d8,54eb8c224740ce5a8a1db56324cbf0c0,3.0,0,1666,Jen,75,2007-07-19 00:56:23,150,83,56,,2,3.25,0,0,0,0,0,4,5,2,2,0,1,5188,Four Kegs Sports Bar,"276 N Jones Blvd, Ste B",Las Vegas,NV,89107,36.175904,-115.223262,4.0,441,1,"{'OutdoorSeating': 'False', 'GoodForMeal': ""{'...","American (New), American (Traditional), Sports...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
1,f3270a0bfa9f67f4b00fd9dadbf1d3e4,e398e51ecca29473c80b058ab17e903e,5.0,1,298,Matthew,430,2014-05-27 06:33:22,1713,466,1155,201620172018,73,4.58,26,3,1,0,0,11,47,67,67,21,30,4590,Thai Spices,"66 S Dobson Rd, Ste 133",Mesa,AZ,85202,33.412708,-111.875803,4.0,343,1,"{'DogsAllowed': 'False', 'Alcohol': ""u'full_ba...","Thai, Restaurants","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3...",0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
2,8515478d756cecae1d46ad274e583fba,62d5a2614594cdfe014cb3f47bc1f183,4.0,2,847,Danielle,66,2013-09-27 21:02:56,186,50,98,2015,16,4.03,12,1,0,0,0,7,15,6,6,9,1,3033,Lakeside,3131 Las Vegas Blvd S,Las Vegas,NV,89109,36.126576,-115.166935,3.5,450,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Restaurants, Steakhouses, American (New), Seafood","{'Monday': '17:30-22:0', 'Tuesday': '17:30-22:...",0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
3,f7a8d4e39387479bee5786d9af7a384e,5e4ad90afb13df5004803bd165a905a9,5.0,3,1153,Lana,247,2013-07-20 22:03:06,363,145,223,2015201620172018,9,3.88,2,0,1,1,0,4,10,10,10,6,1,5268,Saku Sushi,478 Queen Street W,Toronto,ON,M5V 2B2,43.648086,-79.400362,4.0,401,1,"{'HasTV': 'False', 'RestaurantsReservations': ...","Breakfast & Brunch, Sushi Bars, Japanese, Rest...","{'Monday': '12:0-22:30', 'Tuesday': '12:0-23:3...",0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
4,edbc6b267c51aaa5f89ec35237126649,cff931388a0021c662fa8d3437448a89,2.0,4,368,G,113,2012-04-11 07:50:10,53,11,17,,0,3.68,0,0,0,0,0,2,0,0,0,0,1,4636,Dave & Buster's,"2130 Park Centre Dr, Ste 100",Las Vegas,NV,89135,36.148748,-115.332187,3.0,473,1,"{'Caters': 'False', 'RestaurantsTakeOut': 'Tru...","Restaurants, Arts & Entertainment, Sports Bars...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60075,34dea4ede612fd05bd8431c597204b3d,1f3f068cd8717298ed51314cc0d47a77,3.0,60075,704,Tania,128,2007-02-22 17:11:02,118,32,67,,6,3.77,1,5,2,0,0,2,2,8,8,3,0,5704,Artisan Hotel Boutique,1501 W Sahara Ave,Las Vegas,NV,89102,36.143757,-115.169723,3.0,566,1,"{'BusinessAcceptsCreditCards': 'True', 'Music'...","Hotels & Travel, Event Planning & Services, Ho...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
60076,aa24183e54f6657ab3545557271bdc2a,670b348d2a411ec96a1e45c4764192dc,2.0,60076,292,Ennie,434,2009-08-27 18:32:54,330,127,160,2011,22,4.09,11,6,1,3,2,5,24,11,11,10,6,666,Momofuku Noodle Bar,"190 University Avenue, Ground Floor",Toronto,ON,M5H 0A3,43.649678,-79.386275,3.0,897,1,"{'NoiseLevel': ""u'loud'"", 'Caters': 'False', '...","Ramen, Soup, Korean, Japanese, Asian Fusion, A...","{'Monday': '17:0-22:30', 'Tuesday': '17:0-23:0...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
60077,b695bc3cf274961ccba9ab4d2bfaddb8,9e4b3792e6281bc37abfbc4c9eeb05eb,4.0,60077,452,A. Marie,221,2016-01-15 05:01:36,221,69,77,,17,3.70,3,1,0,2,0,10,41,4,4,1,5,2531,Fairmont Royal York,100 Front Street W,Toronto,ON,M5J 1E3,43.645694,-79.381884,3.5,207,1,"{'RestaurantsPriceRange2': '3', 'WiFi': ""u'pai...","Event Planning & Services, Hotels & Travel, Ho...",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
60078,d6b815c6e6775df6bb61ea513e78fd52,c14d09d594c535aa5bd06ca537b9b19e,2.0,60078,1104,Calvin,1146,2011-01-26 01:17:19,1226,391,449,2012201320142015201620172018,43,3.33,15,4,2,0,1,38,32,27,27,20,5,4330,Cafe Hollywood,"Hollywood Square, 7240 Kennedy Road",Markham,ON,L3R 7P2,43.832656,-79.306583,3.0,206,1,"{'DogsAllowed': 'False', 'NoiseLevel': ""u'aver...","Fast Food, Restaurants, Chinese","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0


In [204]:
### remove the columns with too low frequency
for column in val_df.iloc[:,97:]:
    val_df = val_df.drop([column], axis=1)
    
for column in val_df.iloc[:,39:97]:
    print(val_df[column].unique())
    if len(val_df[column].unique()) < 3:
        val_df[column] = val_df[column].fillna(0)
    else:
        val_df[column] = val_df[column].fillna('Nan')
    print(val_df[column].value_counts())

for column in val_df.iloc[:,39:97]:
    if len(val_df[column].unique()) > 2:
        val_df = pd.concat([val_df, pd.get_dummies(val_df[column], prefix=column)], axis=1)
        val_df = val_df.drop([column],axis=1)

[nan  1.]
0.0    59348
1.0      732
Name: GoodForMeal_latenight, dtype: int64
[nan  1.]
0.0    55696
1.0     4384
Name: GoodForMeal_lunch, dtype: int64
[ 1. nan]
0.0    55129
1.0     4951
Name: GoodForMeal_dinner, dtype: int64
[ 1. nan]
0.0    53782
1.0     6298
Name: RestaurantsGoodForGroups, dtype: int64
[nan 'yes_free' 'yes_corkage']
Nan            59594
yes_free         342
yes_corkage      144
Name: BYOBCorkage, dtype: int64
[ 1. nan]
0.0    53548
1.0     6532
Name: BusinessAcceptsCreditCards, dtype: int64
[ 1. nan]
0.0    54967
1.0     5113
Name: GoodForKids, dtype: int64
[nan  1.]
0.0    59950
1.0      130
Name: Corkage, dtype: int64
[ 1. nan]
0.0    54073
1.0     6007
Name: RestaurantsTakeOut, dtype: int64
[ 1. nan]
0.0    55985
1.0     4095
Name: BusinessParking_lot, dtype: int64
[ 1. nan]
0.0    56117
1.0     3963
Name: HasTV, dtype: int64
[ 2.  1.  3. nan  4.]
Nan    52896
2.0     4943
1.0     1368
3.0      735
4.0      138
Name: RestaurantsPriceRange2, dtype: int64
['casual

In [205]:
val_df

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours,GoodForMeal_latenight,GoodForMeal_lunch,GoodForMeal_dinner,RestaurantsGoodForGroups,BusinessAcceptsCreditCards,GoodForKids,Corkage,RestaurantsTakeOut,BusinessParking_lot,HasTV,BestNights_friday,BestNights_wednesday,BestNights_saturday,BikeParking,RestaurantsTableService,Ambience_divey,RestaurantsReservations,RestaurantsDelivery,Caters,Ambience_casual,WheelchairAccessible,BusinessParking_garage,BusinessParking_valet,Ambience_romantic,OutdoorSeating,Ambience_trendy,BusinessParking_street,HappyHour,GoodForMeal_dessert,GoodForMeal_brunch,GoodForMeal_breakfast,Ambience_hipster,Open24Hours,DogsAllowed,BestNights_tuesday,Music_background_music,BestNights_sunday,BestNights_thursday,DriveThru,Ambience_classy,Music_live,Music_dj,BusinessParking_validated,Ambience_upscale,BestNights_monday,ByAppointmentOnly,CoatCheck,GoodForDancing,Ambience_intimate,Ambience_touristy,BYOBCorkage_Nan,BYOBCorkage_yes_corkage,BYOBCorkage_yes_free,RestaurantsPriceRange2_1.0,RestaurantsPriceRange2_2.0,RestaurantsPriceRange2_3.0,RestaurantsPriceRange2_4.0,RestaurantsPriceRange2_Nan,RestaurantsAttire_Nan,RestaurantsAttire_casual,RestaurantsAttire_dressy,Alcohol_Nan,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,NoiseLevel_Nan,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,WiFi_Nan,WiFi_free,WiFi_paid,Smoking_1,Smoking_Nan,Smoking_outdoor,AgesAllowed_21plus,AgesAllowed_Nan,AgesAllowed_allages
0,8becc257dd950c530711e297be327e7b,5ed2bef8c901ebdcc96fedf2f4560c79,5.0,0.0,282.0,Niki,113.0,2014-02-23 19:01:44,135.0,21.0,37.0,201620172018,11.0,4.03,1.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,3183.0,Zest - Bistro & Bar,"10670 Southern Highlands Pkwy, Ste 102",Las Vegas,NV,89141,35.995151,-115.206840,4.5,566.0,1.0,"{'GoodForMeal': ""{'dessert': False, 'latenight...","Restaurants, Lounges, American (New), Italian,...","{'Monday': '16:0-22:0', 'Tuesday': '16:0-22:0'...",0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0
1,6dc9688262188b67fcdaaf30ba5f2b41,5bfc74cc8eaa89e28ae9bbc071770fc4,5.0,1.0,1050.0,Bridgette,37.0,2014-07-12 13:16:07,46.0,13.0,15.0,,1.0,3.92,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,3884.0,The Watershed,5350 S Lakeshore Dr,Tempe,AZ,85283,33.375991,-111.919919,3.0,517.0,1.0,"{'RestaurantsPriceRange2': '2', 'BestNights': ...","Music Venues, Sports Bars, American (Tradition...","{'Monday': '17:0-2:0', 'Tuesday': '16:0-22:0',...",0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0
2,97b4eceee3d20c884508469bf093e5c3,c0ab608051317680920ea102073b92e1,5.0,2.0,552.0,Ann,110.0,2013-01-16 20:26:12,281.0,189.0,253.0,,19.0,3.92,8.0,0.0,0.0,1.0,0.0,3.0,10.0,17.0,17.0,5.0,8.0,2874.0,Brew Tea Bar,"7380 S Rainbow Blvd, Ste 101",Las Vegas,NV,89139,36.054227,-115.242407,5.0,1506.0,1.0,"{'Alcohol': ""'none'"", 'BusinessAcceptsCreditCa...","Tea Rooms, Desserts, Cafes, Restaurants, Food,...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ...",0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0
3,440847a87174f0371f82c9aca88a2c06,9dbd4b0c872e768640b62bf77c18ba59,2.0,3.0,955.0,Priya,117.0,2011-09-23 04:48:01,136.0,48.0,39.0,20132014201520162018,14.0,3.61,3.0,1.0,0.0,0.0,0.0,4.0,8.0,11.0,11.0,1.0,4.0,2633.0,The Dirty Bird Chicken + Waffles,79 Kensington Avenue,Toronto,ON,M5T 2K2,43.654647,-79.400561,3.5,284.0,1.0,"{'RestaurantsGoodForGroups': 'False', 'Busines...","Gluten-Free, Chicken Shop, Burgers, Fast Food,...","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...",0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0
4,d702349096c4afd3791720fdcac583cb,ac74f2fdf664962aaf45f55268e87b05,5.0,4.0,1245.0,Sarah,85.0,2013-05-07 14:34:21,220.0,31.0,130.0,2014201520172018,18.0,3.54,11.0,1.0,0.0,0.0,0.0,6.0,10.0,18.0,18.0,11.0,3.0,5414.0,Square Cafe,1137 S Braddock Ave,Pittsburgh,PA,15218,40.432114,-79.893269,4.0,463.0,1.0,"{'WiFi': ""u'free'"", 'RestaurantsGoodForGroups'...","Breakfast & Brunch, Vegetarian, Restaurants, F...","{'Monday': '0:0-0:0', 'Tuesday': '7:0-15:0', '...",0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60075,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
60076,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
60077,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
60078,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0


In [23]:
te_df.head()

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,...,HairSpecializesIn_extensions,HairSpecializesIn_africanamerican,HairSpecializesIn_curly,HairSpecializesIn_kids,HairSpecializesIn_perms,HairSpecializesIn_asian,Music_karaoke,DietaryRestrictions_gluten-free,BYOB,AcceptsInsurance
0,39b627ea7d06c70fd36d5bfc6c23ddbb,d599f2a9f7e782f0a3fc16fd5dc22027,0.0,0,1253,Mel,47,2013-07-31 21:07:39,40,7,...,,,,,,,,,,
1,2fd27ce10e62ba2578d0c04799027a6e,69423d084bc0b22ea07aae598acc8c52,0.0,1,1933,Netta,34,2011-09-22 23:40:05,46,5,...,,,,,,,,,,
2,d7995d511b5493a4b6ec34af6290e080,9ef9cfddc1e6fab536d4517e204064be,0.0,2,106,Jeffrey,262,2007-12-03 18:57:27,451,296,...,,,,,,,,,,
3,36d8bb0193e3c785a2c47dba1abab521,4380c5fcdedd19169190eaae7de83986,0.0,3,981,Lynda,36,2013-02-03 02:55:18,50,21,...,,,,,,,,,,
4,b4a3aea52dea7ad3fc324a8c8d4356fc,88cd731bf93dd4706e824d0663018808,0.0,4,658,Liz,135,2009-07-08 02:17:34,230,75,...,,,,,,,,,,


In [206]:
set(tr_df.columns) - set(te_df.columns)

{'AgesAllowed_21plus',
 'AgesAllowed_Nan',
 'AgesAllowed_allages',
 'Alcohol_Nan',
 'Alcohol_beer_and_wine',
 'Alcohol_full_bar',
 'Alcohol_none',
 'BYOBCorkage_Nan',
 'BYOBCorkage_yes_corkage',
 'BYOBCorkage_yes_free',
 'NoiseLevel_Nan',
 'NoiseLevel_average',
 'NoiseLevel_loud',
 'NoiseLevel_quiet',
 'NoiseLevel_very_loud',
 'RestaurantsAttire_Nan',
 'RestaurantsAttire_casual',
 'RestaurantsAttire_dressy',
 'RestaurantsPriceRange2_1.0',
 'RestaurantsPriceRange2_2.0',
 'RestaurantsPriceRange2_3.0',
 'RestaurantsPriceRange2_4.0',
 'RestaurantsPriceRange2_Nan',
 'Smoking_1',
 'Smoking_Nan',
 'Smoking_outdoor',
 'WiFi_Nan',
 'WiFi_free',
 'WiFi_paid'}

In [208]:
te_df['AgesAllowed_21plus']

KeyError: 'AgesAllowed_21plus'

In [113]:
for col in list(set(tr_df.columns) - set(val_df.columns)):
  val_df[col] = None

In [114]:
set(tr_df.columns) - set(val_df.columns)

set()

In [115]:
pd.options.display.max_columns = None
tr_df.head()

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours,GoodForMeal_latenight,GoodForMeal_lunch,GoodForMeal_dinner,RestaurantsGoodForGroups,BusinessAcceptsCreditCards,GoodForKids,Corkage,RestaurantsTakeOut,BusinessParking_lot,HasTV,BestNights_friday,BestNights_wednesday,BestNights_saturday,BikeParking,RestaurantsTableService,Ambience_divey,RestaurantsReservations,RestaurantsDelivery,Caters,Ambience_casual,WheelchairAccessible,BusinessParking_garage,BusinessParking_valet,Ambience_romantic,OutdoorSeating,Ambience_trendy,BusinessParking_street,HappyHour,GoodForMeal_dessert,GoodForMeal_brunch,GoodForMeal_breakfast,Ambience_hipster,Open24Hours,DogsAllowed,BestNights_tuesday,Music_background_music,BestNights_sunday,BestNights_thursday,DriveThru,Ambience_classy,Music_live,Music_dj,BusinessParking_validated,Ambience_upscale,BestNights_monday,ByAppointmentOnly,CoatCheck,GoodForDancing,Ambience_intimate,Ambience_touristy,BYOBCorkage_Nan,BYOBCorkage_yes_corkage,BYOBCorkage_yes_free,RestaurantsPriceRange2_1.0,RestaurantsPriceRange2_2.0,RestaurantsPriceRange2_3.0,RestaurantsPriceRange2_4.0,RestaurantsPriceRange2_Nan,RestaurantsAttire_Nan,RestaurantsAttire_casual,RestaurantsAttire_dressy,Alcohol_Nan,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,NoiseLevel_Nan,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,WiFi_Nan,WiFi_free,WiFi_paid,Smoking_1,Smoking_Nan,Smoking_outdoor,AgesAllowed_21plus,AgesAllowed_Nan,AgesAllowed_allages
0,11d83531dcee4af6f5701696ff45a8d8,54eb8c224740ce5a8a1db56324cbf0c0,3.0,0,1666,Jen,75,2007-07-19 00:56:23,150,83,56,,2,3.25,0,0,0,0,0,4,5,2,2,0,1,5188,Four Kegs Sports Bar,"276 N Jones Blvd, Ste B",Las Vegas,NV,89107,36.175904,-115.223262,4.0,441,1,"{'OutdoorSeating': 'False', 'GoodForMeal': ""{'...","American (New), American (Traditional), Sports...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
1,f3270a0bfa9f67f4b00fd9dadbf1d3e4,e398e51ecca29473c80b058ab17e903e,5.0,1,298,Matthew,430,2014-05-27 06:33:22,1713,466,1155,201620172018.0,73,4.58,26,3,1,0,0,11,47,67,67,21,30,4590,Thai Spices,"66 S Dobson Rd, Ste 133",Mesa,AZ,85202,33.412708,-111.875803,4.0,343,1,"{'DogsAllowed': 'False', 'Alcohol': ""u'full_ba...","Thai, Restaurants","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3...",0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
2,8515478d756cecae1d46ad274e583fba,62d5a2614594cdfe014cb3f47bc1f183,4.0,2,847,Danielle,66,2013-09-27 21:02:56,186,50,98,2015.0,16,4.03,12,1,0,0,0,7,15,6,6,9,1,3033,Lakeside,3131 Las Vegas Blvd S,Las Vegas,NV,89109,36.126576,-115.166935,3.5,450,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Restaurants, Steakhouses, American (New), Seafood","{'Monday': '17:30-22:0', 'Tuesday': '17:30-22:...",0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
3,f7a8d4e39387479bee5786d9af7a384e,5e4ad90afb13df5004803bd165a905a9,5.0,3,1153,Lana,247,2013-07-20 22:03:06,363,145,223,2015201620172018.0,9,3.88,2,0,1,1,0,4,10,10,10,6,1,5268,Saku Sushi,478 Queen Street W,Toronto,ON,M5V 2B2,43.648086,-79.400362,4.0,401,1,"{'HasTV': 'False', 'RestaurantsReservations': ...","Breakfast & Brunch, Sushi Bars, Japanese, Rest...","{'Monday': '12:0-22:30', 'Tuesday': '12:0-23:3...",0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
4,edbc6b267c51aaa5f89ec35237126649,cff931388a0021c662fa8d3437448a89,2.0,4,368,G,113,2012-04-11 07:50:10,53,11,17,,0,3.68,0,0,0,0,0,2,0,0,0,0,1,4636,Dave & Buster's,"2130 Park Centre Dr, Ste 100",Las Vegas,NV,89135,36.148748,-115.332187,3.0,473,1,"{'Caters': 'False', 'RestaurantsTakeOut': 'Tru...","Restaurants, Arts & Entertainment, Sports Bars...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0


### Prepare continuous features

In [28]:
# Specify the columns containing conitnuous features
continuous_columns = ["user_average_stars", "user_cool", "user_fans", 
                      "user_review_count", "user_useful", "user_funny",
                      "item_is_open", "item_latitude", "item_longitude", 
                      "item_review_count", "item_stars"]

# Get values of continous features for train/validation/test sets using the utility function defined previously

tr_continuous_features = get_continuous_features(tr_df, continuous_columns)
val_continuous_features = get_continuous_features(val_df, continuous_columns)
te_continuous_features = get_continuous_features(te_df, continuous_columns)

# Standardize each feature by removing the mean of the training samples and scaling to unit variance.
# See https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html for more details.
scaler = StandardScaler().fit(tr_continuous_features)

tr_continuous_features = scaler.transform(tr_continuous_features)
val_continuous_features = scaler.transform(val_continuous_features)
te_continuous_features = scaler.transform(te_continuous_features)

### Prepare deep categorical features

['GoodForMeal_latenight',
 'GoodForMeal_lunch',
 'GoodForMeal_dinner',
 'RestaurantsGoodForGroups',
 'BusinessAcceptsCreditCards',
 'GoodForKids',
 'Corkage',
 'RestaurantsTakeOut',
 'BusinessParking_lot',
 'HasTV',
 'BestNights_friday',
 'BestNights_wednesday',
 'BestNights_saturday',
 'BikeParking',
 'RestaurantsTableService',
 'Ambience_divey',
 'RestaurantsReservations',
 'RestaurantsDelivery',
 'Caters',
 'Ambience_casual',
 'WheelchairAccessible',
 'BusinessParking_garage',
 'BusinessParking_valet',
 'Ambience_romantic',
 'OutdoorSeating',
 'Ambience_trendy',
 'BusinessParking_street',
 'HappyHour',
 'GoodForMeal_dessert',
 'GoodForMeal_brunch',
 'GoodForMeal_breakfast',
 'Ambience_hipster',
 'Open24Hours',
 'DogsAllowed',
 'BestNights_tuesday',
 'Music_background_music',
 'BestNights_sunday',
 'BestNights_thursday',
 'DriveThru',
 'Ambience_classy',
 'Music_live',
 'Music_dj',
 'BusinessParking_validated',
 'Ambience_upscale',
 'BestNights_monday',
 'ByAppointmentOnly',
 'CoatCh

In [96]:
# Sepcify column names of deep categorical features
item_deep_columns = ["item_city", "item_postal_code", "item_state"]
new_item_deep_columns = item_deep_columns + list(tr_df.iloc[:,39:].columns)

# An array of integers where deep_vocab_lens[i] represents the number of unique values of (i+1)-th deep categorical feature
item_deep_vocab_lens = []

for col_name in new_item_deep_columns:
    # Get all unique values of this deep categorical feature
    try:
        tmp = item_df[col_name].unique()
    except:
        tep = tr_df.iloc[:,39:][col_name].unique()
    
    # Create a dictionary mapping each unique value to a unique integral index
    vocab = dict(zip(tmp, range(1, len(tmp) + 1)))
    # Get the number of unique values of this deep categorical features
    item_deep_vocab_lens.append(len(vocab) + 1)
    
    # Create a new column where each entry stores the integral index of this deep categorical feature's value in the same row
    try:
        item_df[col_name + "_idx"] = item_df[col_name].apply(lambda x: vocab[x])
    except:
        pass

# Create a dictionary mapping each business id to corresponding values of deep categorical features
item_deep_idx_columns = [t + "_idx" for t in item_deep_columns]
item_to_deep_categorical_features = dict(zip(item_df.business_id.values, item_df[item_deep_idx_columns].values.tolist()))

# Creat numpy arrays storing corresponding deep categorical features' values of train/validation/test sets using the above mapping
tr_deep_categorical_features = np.array(tr_df.business_id.apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())
val_deep_categorical_features = np.array(val_df.business_id.apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())
te_deep_categorical_features = np.array(te_df.business_id.apply(lambda x: item_to_deep_categorical_features[x]).values.tolist())


In [97]:
print(item_deep_vocab_lens)

[118, 799, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]


In [98]:
tr_deep_categorical_features = np.concatenate((tr_deep_categorical_features, tr_df.iloc[:,39:].to_numpy()), axis=1)

In [100]:
tr_deep_categorical_features.shape

(60080, 82)

### Prepare wide features

##### Prepare binary encoding for each selected category

In [101]:
# Collect the categories of all items 
all_categories = [category for category_list in item_df.item_categories.values for category in category_list.split(", ")]

# Sort all unique values of the item categories by their frequencies in descending order
category_sorted = sorted(Counter(all_categories).items(), key=lambda x: x[1], reverse=True)

# Select top 500 most frequent categories
selected_categories = [t[0] for t in category_sorted[:500]]

# Create a dictionary mapping each secleted category to a unique integral index
selected_categories_to_idx = dict(zip(selected_categories, range(1, len(selected_categories) + 1)))

# Map all categories unseen in the item df to index 0
selected_categories_to_idx['unk'] = 0

# Create a dictionary mapping each integral index to corresponding category
idx_to_selected_categories = {val: key for key, val in selected_categories_to_idx.items()}


##### Prepare cross product transformation for categories

In [102]:
# Get most frequent categories combinantions using the utility function defined previously and store them in the folloing list
top_combinations = []

# Get top 50 most frequent two-categories combinantions in the train set

top_combinations += get_top_k_p_combinations(tr_df, 2, 50, output_freq=False)

# Get top 30 most frequent three-categories combinantions in the train set
top_combinations += get_top_k_p_combinations(tr_df, 3, 30, output_freq=False)

# Get top 20 most frequent four-categories combinantions in the train set
top_combinations += get_top_k_p_combinations(tr_df, 4, 20, output_freq=False)

# Convert each combinantion in the list to a set data structure
top_combinations = [set(t) for t in top_combinations]

In [103]:
# Get values of wide features for train/validation/test sets using the utility function defined previously

tr_wide_features = get_wide_features(tr_df, selected_categories_to_idx, top_combinations)
val_wide_features = get_wide_features(val_df, selected_categories_to_idx, top_combinations)
te_wide_features = get_wide_features(te_df, selected_categories_to_idx, top_combinations)

category_binary_features shape: (60080, 501)
category_cross_features shape: (60080, 100)
wide features shape: (60080, 601)
category_binary_features shape: (7510, 501)
category_cross_features shape: (7510, 100)
wide features shape: (7510, 601)
category_binary_features shape: (7510, 501)
category_cross_features shape: (7510, 100)
wide features shape: (7510, 601)


### Build the input list for each of the train/validation/test sets through aggregating all continuous, deep categorical and wide features


In [42]:
tr_df.head()

Unnamed: 0,user_id,business_id,stars,index,user_Unnamed: 0,user_name,user_review_count,user_yelping_since,user_useful,user_funny,user_cool,user_elite,user_fans,user_average_stars,user_compliment_hot,user_compliment_more,user_compliment_profile,user_compliment_cute,user_compliment_list,user_compliment_note,user_compliment_plain,user_compliment_cool,user_compliment_funny,user_compliment_writer,user_compliment_photos,item_Unnamed: 0,item_name,item_address,item_city,item_state,item_postal_code,item_latitude,item_longitude,item_stars,item_review_count,item_is_open,item_attributes,item_categories,item_hours,GoodForMeal_latenight,GoodForMeal_lunch,GoodForMeal_dinner,RestaurantsGoodForGroups,BusinessAcceptsCreditCards,GoodForKids,Corkage,RestaurantsTakeOut,BusinessParking_lot,HasTV,BestNights_friday,BestNights_wednesday,BestNights_saturday,BikeParking,RestaurantsTableService,Ambience_divey,RestaurantsReservations,RestaurantsDelivery,Caters,Ambience_casual,WheelchairAccessible,BusinessParking_garage,BusinessParking_valet,Ambience_romantic,OutdoorSeating,Ambience_trendy,BusinessParking_street,HappyHour,GoodForMeal_dessert,GoodForMeal_brunch,GoodForMeal_breakfast,Ambience_hipster,Open24Hours,DogsAllowed,BestNights_tuesday,Music_background_music,BestNights_sunday,BestNights_thursday,DriveThru,Ambience_classy,Music_live,Music_dj,BusinessParking_validated,Ambience_upscale,BestNights_monday,ByAppointmentOnly,CoatCheck,GoodForDancing,Ambience_intimate,Ambience_touristy,BYOBCorkage_Nan,BYOBCorkage_yes_corkage,BYOBCorkage_yes_free,RestaurantsPriceRange2_1.0,RestaurantsPriceRange2_2.0,RestaurantsPriceRange2_3.0,RestaurantsPriceRange2_4.0,RestaurantsPriceRange2_Nan,RestaurantsAttire_Nan,RestaurantsAttire_casual,RestaurantsAttire_dressy,Alcohol_Nan,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,NoiseLevel_Nan,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,WiFi_Nan,WiFi_free,WiFi_paid,Smoking_1,Smoking_Nan,Smoking_outdoor,AgesAllowed_21plus,AgesAllowed_Nan,AgesAllowed_allages
0,11d83531dcee4af6f5701696ff45a8d8,54eb8c224740ce5a8a1db56324cbf0c0,3.0,0,1666,Jen,75,2007-07-19 00:56:23,150,83,56,,2,3.25,0,0,0,0,0,4,5,2,2,0,1,5188,Four Kegs Sports Bar,"276 N Jones Blvd, Ste B",Las Vegas,NV,89107,36.175904,-115.223262,4.0,441,1,"{'OutdoorSeating': 'False', 'GoodForMeal': ""{'...","American (New), American (Traditional), Sports...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
1,f3270a0bfa9f67f4b00fd9dadbf1d3e4,e398e51ecca29473c80b058ab17e903e,5.0,1,298,Matthew,430,2014-05-27 06:33:22,1713,466,1155,201620172018.0,73,4.58,26,3,1,0,0,11,47,67,67,21,30,4590,Thai Spices,"66 S Dobson Rd, Ste 133",Mesa,AZ,85202,33.412708,-111.875803,4.0,343,1,"{'DogsAllowed': 'False', 'Alcohol': ""u'full_ba...","Thai, Restaurants","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3...",0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
2,8515478d756cecae1d46ad274e583fba,62d5a2614594cdfe014cb3f47bc1f183,4.0,2,847,Danielle,66,2013-09-27 21:02:56,186,50,98,2015.0,16,4.03,12,1,0,0,0,7,15,6,6,9,1,3033,Lakeside,3131 Las Vegas Blvd S,Las Vegas,NV,89109,36.126576,-115.166935,3.5,450,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Restaurants, Steakhouses, American (New), Seafood","{'Monday': '17:30-22:0', 'Tuesday': '17:30-22:...",0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
3,f7a8d4e39387479bee5786d9af7a384e,5e4ad90afb13df5004803bd165a905a9,5.0,3,1153,Lana,247,2013-07-20 22:03:06,363,145,223,2015201620172018.0,9,3.88,2,0,1,1,0,4,10,10,10,6,1,5268,Saku Sushi,478 Queen Street W,Toronto,ON,M5V 2B2,43.648086,-79.400362,4.0,401,1,"{'HasTV': 'False', 'RestaurantsReservations': ...","Breakfast & Brunch, Sushi Bars, Japanese, Rest...","{'Monday': '12:0-22:30', 'Tuesday': '12:0-23:3...",0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
4,edbc6b267c51aaa5f89ec35237126649,cff931388a0021c662fa8d3437448a89,2.0,4,368,G,113,2012-04-11 07:50:10,53,11,17,,0,3.68,0,0,0,0,0,2,0,0,0,0,1,4636,Dave & Buster's,"2130 Park Centre Dr, Ste 100",Las Vegas,NV,89135,36.148748,-115.332187,3.0,473,1,"{'Caters': 'False', 'RestaurantsTakeOut': 'Tru...","Restaurants, Arts & Entertainment, Sports Bars...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0


In [104]:
# tr_features = [tr_continuous_features,categorical_features_0,categorical_features_1,categorical_features_2,tr_wide_features]
tr_features = []
tr_features.append(tr_continuous_features)
tr_features += [tr_deep_categorical_features[:,i] for i in range(tr_deep_categorical_features.shape[1])]
tr_features.append(tr_wide_features)

val_features = []
val_features.append(val_continuous_features)
val_features += [val_deep_categorical_features[:,i] for i in range(val_deep_categorical_features.shape[1])]
val_features.append(val_wide_features)

te_features = []
te_features.append(te_continuous_features)
te_features += [te_deep_categorical_features[:,i] for i in range(te_deep_categorical_features.shape[1])]
te_features.append(te_wide_features)

In [105]:
item_deep_vocab_lens

[118,
 799,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14]

### Build the WDL model defined above

In [106]:
wdl_model = build_wdl_model(
        len(tr_continuous_features[0]),
        item_deep_vocab_lens,   # num of category classes
        len(tr_wide_features[0]), 
        embed_size=100)
#print(len(tr_continuous_features[0]))
#print(item_deep_vocab_lens)
#print(len(tr_wide_features[0]))

### Train the model using Adagrad optimizer and mean squared error loss

In [107]:
wdl_model.compile(optimizer='adagrad', loss='mse')

history = wdl_model.fit(
        tr_features, 
        tr_ratings, 
        epochs=1, verbose=1, callbacks=[ModelCheckpoint('model.h5')])



### Evaluate the model on train and validation sets using RMSE¶

In [108]:
y_pred = wdl_model.predict(tr_features)
print("TRAIN RMSE: ", rmse(y_pred, tr_ratings))
y_pred = wdl_model.predict(val_features)
print("VALID RMSE: ", rmse(y_pred, val_ratings))

TRAIN RMSE:  1.0450630633659304


ValueError: in user code:

    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1478 predict_function  *
        return step_function(self, iterator)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1468 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1461 run_step  **
        outputs = model.predict_step(data)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1434 predict_step
        return self(x, training=False)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:207 assert_input_compatibility
        ' input tensors. Inputs received: ' + str(inputs))

    ValueError: Layer model_3 expects 84 input(s), but it received 5 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 11) dtype=float32>, <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:4' shape=(None, 601) dtype=int64>]
