In [408]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split


In [409]:
reviews = pd.read_parquet("../data/processed_review_data.parquet")
restaurants = pd.read_csv("../data/processed_restaurant_data.csv")
restaurants_tp = pd.read_csv("../data/yelp_restaurant_data.csv")
restaurants = restaurants.merge(
                    restaurants_tp[[
                    "id", "transactions", "price"
                    ]], on="id", how="left")
restaurants.rename(columns={"id": "restaurant_id"}, inplace=True)
TRANSACTION_TYPES = ['pickup', 'delivery', 'restaurant_reservation']
for t in TRANSACTION_TYPES:
    restaurants[f'is_{t}'] = restaurants['transactions'].apply(lambda lst: 1 if t in lst else 0)
PRICE_MAP = {'$': 1, '$$': 2, '$$$': 3, '$$$$':4}
restaurants['price_cat'] = restaurants['price'].map(PRICE_MAP).fillna(0).astype(int)

In [410]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# turn string type into list
def safe_parse_categories(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else x
    except:
        return []
# flatten dicts in list into list
def extract_titles(category_list):
    if not isinstance(category_list, list):
        return []
    return [d.get("title") for d in category_list if isinstance(d, dict) and "title" in d]

restaurants['parsed_categories'] = restaurants['categories'].apply(safe_parse_categories)
restaurants['category_titles'] = restaurants['parsed_categories'].apply(extract_titles)


In [411]:
reviews = reviews.rename(columns={"normalized_rating":"normalized_rating_reviews"})

In [412]:
restaurants = restaurants.drop(columns=['Unnamed: 0','name', 'categories', 'rating', 'review_count', 'categories_list','log_review_count','wilson_score', 'normalized_latitude', 'normalized_longitude', 'transactions', 'price'])

In [413]:
df = reviews.merge(
    restaurants,
    on="restaurant_id", how="left"
)

In [None]:

df_sorted = df.sort_values(['user_id','time_created'])

test_idx   = df_sorted.groupby('user_id').tail(1).index
train_pool = df.index.difference(test_idx)

train_rest_ids = df.loc[train_pool, 'restaurant_id'].unique()
train_restaurants = restaurants[restaurants['restaurant_id'].isin(train_rest_ids)]

all_categories = set(
    c for lst in train_restaurants['category_titles']
       for c in (lst or []) if isinstance(lst, list)
)
category_vocab = {cat: idx+1 for idx, cat in enumerate(sorted(all_categories))}
category_vocab_size = len(category_vocab) + 1
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_CATEGORIES = 5
def map_to_ids_train(lst):
    return [category_vocab.get(c,0) for c in (lst or [])][:MAX_CATEGORIES]

restaurants['category_ids'] = restaurants['category_titles'].apply(map_to_ids_train)
restaurants['category_ids_padded'] = pad_sequences(
    restaurants['category_ids'], maxlen=MAX_CATEGORIES, padding='post'
).tolist()

In [415]:
df = reviews.merge(
    restaurants,
    on="restaurant_id", how="left"
)

In [416]:
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['mapped_sentiment_num'] = df['mapped_sentiment'].map(sentiment_map)

In [417]:
EMBED_DIM = 8
DNN_UNITS = [64, 32]
DROPOUT_RATE = 0.3
USER_EMB_DIM = 4
PRICE_EMB_DIM = 4
CATEGORY_EMB_DIM = 8


In [418]:
user_encoder = LabelEncoder()
user_encoder.fit(df.loc[train_pool, 'user_id'])
df.loc[train_pool, 'user_id_encoded'] = user_encoder.fit_transform(
         df.loc[train_pool,'user_id'])
df['user_id_encoded'] = df['user_id'].map(
         lambda u: user_encoder.transform([u])[0]
                  if u in user_encoder.classes_ else 0)
user_vocab_size = len(user_encoder.classes_) + 1

price_vocab_size = 5

numeric_cols = ['normalized_rating_restaurants','normalized_log_review_count',
       'popularity_score','normalized_wilson_score','is_pickup', 
       'is_delivery', 'is_restaurant_reservation'] 
feature_cols = {
    'user_id': df['user_id_encoded'].values,
    'price_cat': df['price_cat'].values,
    'category_ids': np.stack(df['category_ids_padded'].values)
}


In [419]:
# input layer
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten, Dropout, Reshape
user_id_in = Input(shape=(1,), name='user_id', dtype='int32')
price_in = Input(shape=(1,), name='price_cat', dtype='int32')
category_in = Input(shape=(MAX_CATEGORIES,), name='category_ids', dtype='int32')
numeric_inputs = [Input(shape=(1,), name=col, dtype='float32') for col in numeric_cols]


In [420]:
# embedding # dense
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Concatenate, Flatten, Dropout,
    Reshape, Lambda
)

user_emb = Embedding(user_vocab_size, USER_EMB_DIM)(user_id_in)
price_emb = Embedding(price_vocab_size, PRICE_EMB_DIM)(price_in)

cat_emb = Embedding(category_vocab_size, CATEGORY_EMB_DIM)(category_in)  
cat_emb_pooled = Lambda(
    lambda x: K.mean(x, axis=1, keepdims=True),
    output_shape=(1, EMBED_DIM)
)(cat_emb)
UNIFIED_EMB_DIM = 8
user_emb = Dense(UNIFIED_EMB_DIM)(user_emb)
price_emb = Dense(UNIFIED_EMB_DIM)(price_emb)

num_embs = []
for inp in numeric_inputs:
    x = Reshape((1,1))(inp)
    x = Dense(EMBED_DIM,activation='relu')(x)
    num_embs.append(x)

In [421]:
# FM
all_embs = [user_emb, price_emb, cat_emb_pooled] + num_embs
fm_input = Concatenate(axis=1)(all_embs)

sum_sq = Lambda(lambda x: K.square(K.sum(x, axis=1)), output_shape=(EMBED_DIM,))(fm_input)
sq_sum = Lambda(lambda x: K.sum(K.square(x), axis=1), output_shape=(EMBED_DIM,))(fm_input)
cross = Lambda(lambda t: 0.5 * K.sum(t[0] - t[1], axis=1, keepdims=True), output_shape=(1,))([sum_sq, sq_sum])

In [422]:
# flattening for dense 96->64->32
deep_x = Flatten()(fm_input)
for n in DNN_UNITS:
    deep_x = Dense(n, activation='relu')(deep_x)
    deep_x = Dropout(DROPOUT_RATE)(deep_x) #0.3
deep_out = Dense(1)(deep_x)

In [423]:
# concat FM and deep output
logits = Concatenate(axis=1)([cross, deep_out])
# output = Dense(1, activation='sigmoid')(logits)
output = Dense(1, activation='linear', name='pred_normalized_rating')(logits)


In [424]:
from tensorflow.keras.models import Model
model = Model(inputs=[user_id_in, price_in, category_in] + numeric_inputs, outputs=output)

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

In [425]:
X = {
    'user_id'      : df['user_id_encoded'].values.astype('int32'),
    'price_cat'    : df['price_cat'].values.astype('int32'),
    'category_ids' : np.stack(df['category_ids_padded'].values).astype('int32')
}
for col in numeric_cols:
    X[col] = df[col].values.astype('float32')

In [426]:
df_sorted = df.sort_values(['user_id','time_created'])
# most recent review of each user as testing set
test_idx  = df_sorted.groupby('user_id').tail(1).index
train_pool = df.index.difference(test_idx)

# make sure a user wont be in val and train set at the same time
from sklearn.model_selection import GroupShuffleSplit
# 0.1 of train as val
gss = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, val_idx = next(gss.split(df.loc[train_pool],
                                    groups=df.loc[train_pool,'user_id']))

train_X = {k: v[train_idx] for k,v in X.items()}
val_X   = {k: v[val_idx]   for k,v in X.items()}
train_y = df.loc[train_idx,'normalized_rating_reviews'].values.astype('float32').reshape(-1,1)
val_y   = df.loc[val_idx,  'normalized_rating_reviews'].values.astype('float32').reshape(-1,1)
test_df = df.loc[test_idx]

In [427]:
from tensorflow.keras import backend as K
K.clear_session()
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(train_X, train_y, validation_data=(val_X,val_y),
          batch_size=256, epochs=12)


Epoch 1/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.9507 - mae: 0.8416 - val_loss: 0.1262 - val_mae: 0.2557
Epoch 2/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.2062 - mae: 0.3619 - val_loss: 0.0964 - val_mae: 0.2682
Epoch 3/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1472 - mae: 0.3163 - val_loss: 0.0902 - val_mae: 0.2653
Epoch 4/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1230 - mae: 0.2921 - val_loss: 0.0768 - val_mae: 0.2357
Epoch 5/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1027 - mae: 0.2648 - val_loss: 0.0767 - val_mae: 0.2410
Epoch 6/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1075 - mae: 0.2708 - val_loss: 0.0710 - val_mae: 0.2270
Epoch 7/12
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0881 

<keras.src.callbacks.history.History at 0x300390970>

In [428]:
def recommend_topk(model, user_id, pref_categories=None,
                   price_cat=None, is_delivery=None, is_pickup=None,
                   top_k=10, category_vocab=None, MAX_CATEGORIES=5):

    cand = restaurants.copy()

    # filter restaurants based on user perference
    if pref_categories is not None:
        cand = cand[cand['category_titles'].apply(
            lambda lst: any(c in lst for c in pref_categories))]
    if price_cat is not None:
        cand = cand[cand['price_cat'] == price_cat]
    if is_delivery is not None:
        cand = cand[cand['is_delivery'] == int(is_delivery)]
    if is_pickup is not None:
        cand = cand[cand['is_pickup'] == int(is_pickup)]
    if cand.empty:
        return pd.DataFrame()
    # leave-one-out for evaluation, exclude interacted restaurants in train
    user_hist = df.loc[train_idx]
    user_hist = user_hist[user_hist['user_id'] == user_id]
    seen = set(user_hist['restaurant_id'])
    cand = cand[~cand['restaurant_id'].isin(seen)]
    if cand.empty:
        return pd.DataFrame()


    user_id_encoded = user_encoder.transform([user_id])[0] \
                      if user_id in user_encoder.classes_ else 0

    # turn input categoty into id and cut to 5 ids
    cand['category_ids_pad'] = cand['category_titles'].apply(
        lambda lst: [category_vocab.get(c, 0) for c in lst][:MAX_CATEGORIES]
    )
    # make up to 5 id with 0 if len<5
    cand['category_ids_pad'] = cand['category_ids_pad'].apply(
        lambda x: x[:MAX_CATEGORIES] + [0]*(MAX_CATEGORIES - len(x))
    )
    # handle situations unexpected situation like cand is empty
    try:
        category_ids = np.vstack(cand['category_ids_pad'].values).astype('int32')
    except ValueError as e:
        print(f"Error stacking categories: {str(e)}, cand shape: {cand.shape}")
        return pd.DataFrame()
    model_input = {
        'user_id'   : np.full(len(cand), user_id_encoded, dtype='int32'),
        'price_cat' : cand['price_cat'].values.astype('int32'),
        'category_ids': category_ids
    }
    for col in numeric_cols:
        model_input[col] = cand[col].values.astype('float32')

    # predictions
    cand['predicted_rating'] = model.predict(model_input, verbose=0).ravel()
    return (cand.sort_values('predicted_rating', ascending=False)
                .head(top_k)
                [['restaurant_id','category_titles','price_cat',
                  'predicted_rating']])

In [429]:
top10 = recommend_topk(model, user_id='tgeFUChlh7v8bZFVl2-hjQ',
                       pref_categories=['Bars'], top_k=10,
                       category_vocab=category_vocab
                       )

print(top10)

               restaurant_id       category_titles  price_cat  \
5720  onlTYPgNFt7aXLs-cWTvIg                [Bars]          2   
316   fvmGh1NssZR5v7Ursn5vSA                [Bars]          0   
742   f8SAw1edzvnkSzPAUl80DA                [Bars]          1   
793   Ec8LO9yop4QPLiF1husTZQ  [Bars, Distilleries]          0   
6227  UVRkAcABnSxOIOpKXkdWAQ     [Mini Golf, Bars]          2   
5691  QHx-ZSbJ20FBW7QqEzCWhg                [Bars]          1   
2843  gdv8xA0rLs6Y916yf7E08A                [Bars]          0   
5173  FAvyjIekpR1C40aA6AI2hQ                [Bars]          0   
6373  91L8OPJNUyNauXzjDQRurg      [Bars, American]          2   
1046  awJjRw8x6ZP_26SvIejbxw                [Bars]          1   

      predicted_rating  
5720          0.758076  
316           0.752578  
742           0.739279  
793           0.714094  
6227          0.708841  
5691          0.697914  
2843          0.693627  
5173          0.691747  
6373          0.684909  
1046          0.674374  


In [None]:
# hit rate
# for every user, does our recommender system recommend 
from tqdm import tqdm

def evaluate_hit_rate(model, test_df, top_k=10):
    hit_count = 0
    total = 0

    for u in tqdm(test_df['user_id'].unique(), desc="HitRate@K"):
        # last interacted restaurant
        record = test_df[test_df['user_id']==u].iloc[0]
        true_r = record['restaurant_id']
        # top 10 recommendation
        recs = recommend_topk(
            model, u,
            pref_categories=None,
            
            top_k=top_k, category_vocab=category_vocab
        )
        if recs.empty:
            continue
        total += 1
        if true_r in recs['restaurant_id'].values:
            hit_count += 1

    return hit_count / total if total>0 else 0.0


In [None]:
# category coverage rate
def evaluate_category_coverage(model, test_df, top_k=10, category_vocab=None):

    train_categories = set(category_vocab.keys())
    total_cats = len(train_categories)

    seen_cats = set()
    for u in test_df['user_id'].unique():
        recs = recommend_topk(
            model, u,
            pref_categories=None,
            top_k=top_k, category_vocab=category_vocab
        )
        if recs.empty:
            continue

        for cats in recs['category_titles']:
            for c in cats:
                if c in train_categories:
                    seen_cats.add(c)

    return len(seen_cats) / total_cats if total_cats > 0 else 0.0

In [432]:
from tqdm import tqdm
hit_rate_10 = evaluate_hit_rate(model, test_df, top_k=10)
print(f"Hit Rate@10: {hit_rate_10:.4f}")


HitRate@K: 100%|██████████| 9493/9493 [30:11<00:00,  5.24it/s]  

Hit Rate@10: 0.0022





In [434]:
coverage = evaluate_category_coverage(
    model,
    test_df,
    top_k=10,
    category_vocab=category_vocab
)
print(f"Category Coverage: {coverage:.4f}")

Category Coverage: 0.0334
