Let's add code that recommends restaurants to users based on their predicted ratings.

### Implementing Restaurant Recommendation System

Now let's convert our rating-prediction model into an actual restaurant recommendation system.

In [1]:
import pandas as pd
from data_preprocessing import load_and_preprocess_data
from models.two_tower_model import build_two_tower_model
from models.baselines import matrix_factorization, item_based_cf
from evaluation import evaluate_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from itertools import product
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_large_json(file_path, chunksize=10000):
    """Read a JSON Lines file piece by piece and return it as one DataFrame.

    Args:
        file_path: Path of the newline-delimited JSON file to read.
        chunksize: Number of lines parsed per chunk.

    Returns:
        A single DataFrame holding every chunk, re-indexed from 0.
    """
    print(f"Loading {file_path} in chunks...")
    reader = pd.read_json(file_path, lines=True, chunksize=chunksize)
    # Materialise every chunk first, then stitch them together in one concat.
    frames = list(reader)
    return pd.concat(frames, ignore_index=True)

In [7]:
# Read the five raw JSON Lines dumps in a fixed order; each path is loaded
# with load_large_json and bound to its own module-level name.
raw_json_paths = [
    '../data/business.json',
    '../data/user.json',
    '../data/tip.json',
    '../data/checkin.json',
    '../data/review.json',
]
business_data, user_data, tip_data, checkin_data, review_data = map(
    load_large_json, raw_json_paths
)

Loading ../data/business.json in chunks...
Loading ../data/user.json in chunks...
Loading ../data/tip.json in chunks...
Loading ../data/checkin.json in chunks...
Loading ../data/review.json in chunks...


In [8]:
# Run the project's preprocessing step on the raw tables. It yields four
# objects, which are unpacked into the names the rest of the notebook uses.
preprocessing_outputs = load_and_preprocess_data(
    business_data,
    user_data,
    review_data,
    tip_data,
    checkin_data,
    max_categories=50,
)
train_data, restaurant_features, user_df, category_names = preprocessing_outputs

In [9]:
import os
import pickle
def save_preprocessed_data(data, file_path):
    """
    Pickle ``data`` and write it to ``file_path``, overwriting any existing file.

    Args:
        data: Any picklable object (here: the preprocessed data).
        file_path: Destination path of the pickle file.
    """
    with open(file_path, 'wb') as handle:
        pickle.Pickler(handle).dump(data)
    print(f"Data saved to {file_path}")
    
def load_preprocessed_data(file_path):
    """
    Read a pickle file back from disk, if it is there.

    Only use this on files this notebook wrote itself: unpickling runs
    arbitrary code embedded in the file.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or None when ``file_path`` does not exist.
    """
    # Guard clause: nothing cached yet.
    if not os.path.exists(file_path):
        print(f"No preprocessed data found at {file_path}")
        return None

    with open(file_path, 'rb') as handle:
        restored = pickle.load(handle)
    print(f"Data loaded from {file_path}")
    return restored

In [10]:
# Path of the pickle file used to cache the preprocessed data between runs.
preprocessed_file_path = "../data/preprocessed_data.pkl"

In [11]:

# Bundle the preprocessing outputs together with the raw tables, then write
# the whole bundle to the pickle cache in one go.
preprocessed_data = dict(
    train_data=train_data,
    restaurant_features=restaurant_features,
    user_df=user_df,
    category_names=category_names,
    business_data=business_data,
    user_data=user_data,
    tip_data=tip_data,
    checkin_data=checkin_data,
    review_data=review_data,
)

save_preprocessed_data(preprocessed_data, preprocessed_file_path)

Data saved to ../data/preprocessed_data.pkl


In [12]:
# Reload the cached bundle and rebind the working names from it.
preprocessed_data = load_preprocessed_data(preprocessed_file_path)
if preprocessed_data is None:
    # load_preprocessed_data returns None when the cache file is absent;
    # fail here with a clear message instead of a TypeError on the next line.
    raise FileNotFoundError(
        f"Preprocessed cache not found at {preprocessed_file_path}; "
        "run the preprocessing and save cells first."
    )
train_data = preprocessed_data["train_data"]
restaurant_features = preprocessed_data["restaurant_features"]
user_df = preprocessed_data["user_df"]
category_names = preprocessed_data["category_names"]

Data loaded from ../data/preprocessed_data.pkl


In [13]:
# Print the GPUs TensorFlow can see; an empty list means none were detected.
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[]


In [13]:
# Two-stage split with a fixed seed: hold out 20% as the test set, then hold
# out 20% of the remainder as the validation set (64% / 16% / 20% overall).
train_val, test = train_test_split(train_data, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.2, random_state=42)

In [14]:
# Persist each split as CSV without the index column.
for split_frame, csv_path in (
    (train, '../data/train.csv'),
    (test, '../data/test.csv'),
    (val, '../data/val.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [15]:
# Show the training split as the cell's output.
train

Unnamed: 0,user_id,business_id,stars,review_count_norm_x,average_stars_norm,elite_binary,fans_norm,friends_count_norm,stars_norm,review_count_norm_y,...,cat_45,cat_46,cat_47,cat_48,cat_49,park_garage,park_street,park_validated,park_lot,park_valet
4670133,DkME5tzZe25XUUhaudGtdw,l-rGtJt0E7PAklT0IK7oFQ,1,0.031557,-1.546892,0,-0.025688,-0.013186,1.187133,2.226831,...,0,0,0,0,1,False,False,False,False,True
3961966,nL4VRVgSpOiV4xrGGHdUpA,uxZENO1iy7tz8MK77L3_GQ,4,-0.113779,0.253103,0,-0.025688,-0.361066,0.584422,0.972451,...,0,1,0,1,1,False,True,False,False,False
2324996,EviLk1SQozBvNMKOQvhJwg,So20ueL9qAIpV6FIwUtlKA,5,-0.101668,0.785496,0,-0.080843,-0.320139,1.789844,-0.329564,...,0,0,0,1,1,False,False,False,True,False
4292971,DE-idFvMJ_6cjChcTFa3JQ,oHvEgLH6pAkcrPmeR1l3UQ,5,-0.150113,0.616482,0,-0.080843,-0.354245,1.187133,3.084256,...,1,0,1,0,1,False,True,False,False,False
4103590,1urqOIfpbmKhga_JHND8Sw,Cmw00BFD1l-_DJHPuKi2Rw,5,-0.247004,1.157326,0,-0.080843,0.246019,0.584422,2.290344,...,1,0,1,0,1,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826792,tBo5BDh-uybm7ZZ9PUfBUA,ws2gnWU77nydKJB81GXT9g,1,-0.259116,0.033385,0,-0.080843,4.400116,0.584422,1.713435,...,0,0,0,0,1,False,False,False,True,False
947791,Bw3zquF7auR_yBx60HWVTQ,GXjN3c0swlIfH5kbVxATFQ,5,-0.247004,1.157326,0,-0.080843,-0.279212,1.187133,0.670765,...,0,0,0,0,1,True,True,False,True,False
1472514,LqNqJ_uW_2w1wIBmTsuaFg,TMBbCDQG6bENJoQSST0XiA,5,-0.138002,0.244652,0,-0.025688,0.662111,1.187133,1.311187,...,0,0,0,0,1,False,True,False,True,False
4454082,xZjKpD0wtlI0uUP0srdu7w,ZNOc9LErlBDtJF3Y7LQtwA,5,-0.162225,-1.039851,0,-0.080843,-0.361066,0.584422,0.766034,...,0,1,0,1,1,False,True,False,True,False


In [19]:
def create_dataset(data):
    """Turn a merged review frame into model inputs and labels.

    Args:
        data: DataFrame with the normalised user columns, the normalised
            restaurant columns, ``cat_0`` .. ``cat_49``, the five ``park_*``
            columns and the ``stars`` label column.

    Returns:
        A ``(user_features, rest_features, labels)`` tuple. The first two are
        dicts of float32 arrays keyed by feature name; ``labels`` is the
        float32 ``stars`` column.
    """
    def as_float32(columns):
        # Works for a single column name (1-D result) or a list (2-D result).
        return data[columns].values.astype(np.float32)

    # (feature key, source column) pairs for the user inputs.
    user_columns = (
        ('review_count', 'review_count_norm_x'),
        ('average_stars', 'average_stars_norm'),
        ('fans', 'fans_norm'),
        ('friends_count', 'friends_count_norm'),
        ('elite', 'elite_binary'),
    )
    # Restaurant inputs; 'stars_norm' is not included here.
    restaurant_columns = (
        ('review_count', 'review_count_norm_y'),
        ('lat', 'lat_norm'),
        ('lon', 'lon_norm'),
        ('categories', [f'cat_{i}' for i in range(50)]),
        ('parking', ['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']),
    )

    user_features = {key: as_float32(source) for key, source in user_columns}
    rest_features = {key: as_float32(source) for key, source in restaurant_columns}

    # Target: the star rating given in the review.
    labels = as_float32('stars')

    return user_features, rest_features, labels

In [20]:
# Build (user inputs, restaurant inputs, labels) for each split, in the order
# train -> val -> test.
(
    (train_user, train_rest, train_labels),
    (val_user, val_rest, val_labels),
    (test_user, test_rest, test_labels),
) = map(create_dataset, (train, val, test))

In [21]:
print("Hyperparameter tuning for Two-Tower Model...")
# Search space for the tuning loop; each entry lists the values to try.
# Every list currently holds a single value, so exactly one configuration runs.
param_grid = dict(
    embedding_dim=[64],
    learning_rate=[0.001],
    batch_size=[64],
)

Hyperparameter tuning for Two-Tower Model...


In [23]:
# Grid search: train one model per configuration and keep the one with the
# lowest validation RMSE.
best_rmse = float('inf')
best_params = None
best_model = None

# Read the grid entries by key so the unpacking does not depend on the
# insertion order of param_grid.
search_space = product(
    param_grid['embedding_dim'],
    param_grid['learning_rate'],
    param_grid['batch_size'],
)

for emb_dim, lr, batch in search_space:
    print(f"Testing params: emb_dim={emb_dim}, lr={lr}, batch={batch}")

    # NOTE(review): create_dataset supplies five user features and three
    # scalar restaurant features plus five parking flags, while the dims below
    # are 4 and 4+5. Confirm against build_two_tower_model.
    model = build_two_tower_model(
        user_feature_dim=4,
        rest_feature_dim=4+5,
        category_dim=len(category_names),
        embedding_dim=emb_dim
    )
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                    loss='mse',
                    metrics=['mae'])

    history = model.fit(
        [train_user, train_rest], train_labels,
        validation_data=([val_user, val_rest], val_labels),
        epochs=3,
        batch_size=batch,
        verbose=0
    )
    print(f"Training completed for params: emb_dim={emb_dim}, lr={lr}, batch={batch}")

    val_pred = model.predict([val_user, val_rest], batch_size=batch, verbose=0)
    val_rmse = np.sqrt(mean_squared_error(val_labels, val_pred))

    # Keep this configuration only if it beats the best one so far. The first
    # configuration is always accepted, so best_model is set even if its RMSE
    # is NaN.
    if best_model is None or val_rmse < best_rmse:
        best_rmse = val_rmse
        best_params = {'embedding_dim': emb_dim, 'learning_rate': lr, 'batch_size': batch}
        best_model = model

    tf.keras.backend.clear_session()

print(f"Best params: {best_params}, Validation RMSE: {best_rmse:.4f}")

print("Evaluating best model on test set...")
# Evaluate best model
test_pred = best_model.predict([test_user, test_rest], batch_size=best_params['batch_size'], verbose=0)
evaluate_model(test_labels, test_pred.flatten(), "Two-Tower Model")
print("-------------------------------------------------------------------")
    

Testing params: emb_dim=64, lr=0.001, batch=64

Best params: {'embedding_dim': 64, 'learning_rate': 0.001, 'batch_size': 64}, Validation RMSE: 1.1289
Evaluating best model on test set...
Two-Tower Model - RMSE: 1.1299, MAE: 0.8620, Precision@5: 0.3889
-------------------------------------------------------------------


In [24]:
print(f"Best params: {best_params}, Validation RMSE: {best_rmse:.4f}")
print("Evaluating best model on test set...")
# Evaluate best model
test_pred = best_model.predict([test_user, test_rest], batch_size=best_params['batch_size'], verbose=0)
evaluate_model(test_labels, test_pred.flatten(), "Two-Tower Model")
print("-------------------------------------------------------------------")
print("Evaluating baseline models...")
print("Evaluating Matrix Factorization...")
# NOTE(review): the baseline is fitted on train_data, the frame the test split
# was drawn from, so it is scored on rows it has already seen. Fitting on
# `train` would remove the leak, provided predict_mf tolerates user/business
# ids it has not seen. Confirm before switching.
predict_mf, user_map, item_map = matrix_factorization(train_data)
# Iterate the two id columns directly; iterrows() would build a Series per row.
mf_pred = [
    predict_mf(user_id, business_id, user_map, item_map)
    for user_id, business_id in zip(test['user_id'], test['business_id'])
]
evaluate_model(test_labels, mf_pred, "Matrix Factorization")

# Item-based CF baseline, currently disabled:
# print("Evaluating Item-Based CF...")
# predict_icf, item_similarities = item_based_cf(train)
# icf_pred = [predict_icf(row['user_id'], row['business_id'], train, item_similarities) for _, row in test.iterrows()]
# evaluate_model(test_labels, icf_pred, "Item-Based CF")

print("-------------------------------------------------------------------")

Best params: {'embedding_dim': 64, 'learning_rate': 0.001, 'batch_size': 64}, Validation RMSE: 1.1289
Evaluating best model on test set...
Two-Tower Model - RMSE: 1.1299, MAE: 0.8620, Precision@5: 0.3889
-------------------------------------------------------------------
Evaluating baseline models...
Evaluating Matrix Factorization...


  check_blas_config()
100%|██████████| 20/20 [02:27<00:00,  7.36s/it]


Matrix Factorization - RMSE: 3.9281, MAE: 3.6789, Precision@5: 0.0000
-------------------------------------------------------------------


In [42]:
print("-------------------------------------------------------------------")
print("Training completed for all models.")
# Save the model first and report success afterwards, so a failed save is
# never followed by a "saved" message.
best_model.save('two_tower_model.h5')
print("Model saved as 'two_tower_model.h5'")



-------------------------------------------------------------------
Training completed for all models.
Model saved as 'two_tower_model.h5'


In [25]:
def generate_recommendations(model, user_id, top_n=10, max_candidates=1000,
                             batch_size=100, random_state=None):
    """
    Generate restaurant recommendations for a specific user.

    Reads the notebook-level ``user_df``, ``train_data``,
    ``restaurant_features`` and ``preprocessed_data["business_data"]``.

    Args:
        model: The trained two-tower model
        user_id: The user ID to generate recommendations for
        top_n: Number of recommendations to return
        max_candidates: Upper bound on the number of unvisited restaurants
            that are scored. When more are available a random subset of this
            size is drawn, so the result is the top-n of that subset. Pass
            None to score every unvisited restaurant.
        batch_size: Batch size handed to ``model.predict``.
        random_state: Optional integer seed for the candidate sampling. With
            None the global NumPy random state is used, as before.

    Returns:
        DataFrame containing the top-n restaurant recommendations for the
        user, ordered by predicted rating (highest first). None when the user
        is unknown or there is no unvisited restaurant to score.
    """
    # Get user features
    user_info = user_df[user_df['user_id'] == user_id]
    if len(user_info) == 0:
        print(f"User ID {user_id} not found in the dataset.")
        return None

    # Model input key -> column of user_df; one-element float32 array each.
    user_columns = {
        'review_count': 'review_count_norm',
        'average_stars': 'average_stars_norm',
        'fans': 'fans_norm',
        'friends_count': 'friends_count_norm',
        'elite': 'elite_binary',
    }
    user_features = {
        key: np.array([user_info[column].values[0]], dtype=np.float32)
        for key, column in user_columns.items()
    }

    # Restaurants the user has already reviewed are excluded from the candidates.
    visited_restaurants = set(train_data.loc[train_data['user_id'] == user_id, 'business_id'])
    is_unvisited = ~restaurant_features['business_id'].isin(visited_restaurants)
    # unique() keeps first-appearance order, so the candidate list (and hence a
    # seeded sample) does not depend on set/hash ordering.
    candidate_ids = restaurant_features.loc[is_unvisited, 'business_id'].unique()

    # If there are too many restaurants, sample a subset for efficiency
    if max_candidates is not None and len(candidate_ids) > max_candidates:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        candidate_ids = rng.choice(candidate_ids, max_candidates, replace=False)

    restaurant_data = restaurant_features[restaurant_features['business_id'].isin(candidate_ids)]
    if len(restaurant_data) == 0:
        # Nothing to score; without this guard the sort below raises KeyError.
        print(f"No unvisited restaurants available for user {user_id}.")
        return None

    # Restaurant inputs for all candidates at once.
    rest_features_batch = {
        'review_count': restaurant_data['review_count_norm'].values.astype(np.float32),
        'lat': restaurant_data['lat_norm'].values.astype(np.float32),
        'lon': restaurant_data['lon_norm'].values.astype(np.float32),
        'categories': restaurant_data[[f'cat_{i}' for i in range(50)]].values.astype(np.float32),
        'parking': restaurant_data[['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']].values.astype(np.float32)
    }

    # Same user repeated once per candidate restaurant.
    n_candidates = len(restaurant_data)
    user_features_batch = {k: np.repeat(v, n_candidates, axis=0) for k, v in user_features.items()}

    # One predict call; Keras does the batching internally.
    predicted_ratings = model.predict(
        [user_features_batch, rest_features_batch],
        batch_size=batch_size,
        verbose=0,
    )
    # Take the first output column per row (the original read predicted[j][0]).
    scores = np.asarray(predicted_ratings).reshape(n_candidates, -1)[:, 0]

    # Convert to DataFrame and sort by predicted rating
    recommendations_df = pd.DataFrame({
        'business_id': restaurant_data['business_id'].to_numpy(),
        'predicted_rating': scores,
    })
    recommendations_df = recommendations_df.sort_values('predicted_rating', ascending=False).head(top_n)

    # Get restaurant details from business_data
    business_data_all = preprocessed_data["business_data"]
    business_details = business_data_all[
        business_data_all['business_id'].isin(recommendations_df['business_id'])
    ][['business_id', 'name', 'address', 'city', 'stars', 'review_count', 'categories']]

    # Merge with recommendations
    final_recommendations = recommendations_df.merge(business_details, on='business_id')

    return final_recommendations[['business_id', 'name', 'address', 'city', 'stars', 'review_count', 'categories', 'predicted_rating']]

In [26]:
# Let's demonstrate the recommendation system by generating recommendations for a sample user
# First, let's find a user with a good number of reviews to use as an example

# Get users who have reviewed several restaurants
# Count reviews per user in train_data and keep users with more than 10,
# most active first.
user_review_counts = train_data['user_id'].value_counts().reset_index()
user_review_counts.columns = ['user_id', 'review_count']
has_enough_reviews = user_review_counts['review_count'] > 10
active_users = user_review_counts[has_enough_reviews].sort_values('review_count', ascending=False)

if len(active_users) == 0:
    print("No active users found in the dataset.")
else:
    # Use the single most active user as the showcase.
    sample_user_id = active_users.iloc[0]['user_id']
    print(f"Generating recommendations for user: {sample_user_id} who has {active_users.iloc[0]['review_count']} reviews")

    recommendations = generate_recommendations(best_model, sample_user_id, top_n=10)

    # generate_recommendations returns None when it cannot produce a result.
    if recommendations is None:
        print("Could not generate recommendations for this user.")
    else:
        display(recommendations)

Generating recommendations for user: _BcWyKQL16ndpBdggh2kNA who has 1704 reviews


Unnamed: 0,business_id,name,address,city,stars,review_count,categories,predicted_rating
0,61wOYrhTgEf-Xjif7lCREQ,Cafe Breve,120 S Central Ave Lobby,Clayton,4.5,7,"Coffee & Tea, Food, Ice Cream & Frozen Yogurt,...",4.537729
1,5lUtEhbOrvBARVNI-YLgLA,Meat Wagon BBQ,1697 Bethlehem Pike,Hatfield,4.5,65,"Festivals, Arts & Entertainment, Restaurants, ...",4.458914
2,LeLs-93CZezcvx3O607Dgw,MELT Food Truck,5707 W Emerald St,Boise,4.0,13,"Food Trucks, Event Planning & Services, Catere...",4.366618
3,OTjS4wcvaabAzpfkwVzwxw,PDQ Test Kitchen,"4343 Anchor Plaza Pkwy, Ste 100",Tampa,4.0,28,"Sandwiches, Restaurants, American (Traditional...",4.279662
4,Ty1hqO9LZ4CjRdIA_i-1ig,Formaggi Pizza,"483 Mandalay Ave, Ste 106",Clearwater,4.0,71,"Restaurants, Nightlife, Italian, Bars, Pizza, ...",4.230304
5,SwdHTo4A_5eA8QsfelTPdg,Food for All Market,7127 Germantown Ave,Philadelphia,4.5,13,"Health Markets, Specialty Food, Restaurants, B...",4.229907
6,EAtdpksdWfWAaqh3XdnNNg,Essene Market & Café,719 S 4th St,Philadelphia,3.5,149,"Restaurants, Food, Vegetarian, Organic Stores,...",4.21413
7,jBCgQ6nZDsMCWU7eaSxOpw,Whaley's Blazin BBQ,18213 N US Highway 41,Lutz,4.5,58,"Sandwiches, Restaurants, Barbeque, Food Trucks...",4.206501
8,BzUTIP9r911o3VQAHOYizg,Chuck E. Cheese,1540 W Brandon Blvd,Brandon,2.5,32,"Pizza, Restaurants, Event Planning & Services,...",4.199856
9,Pr5TeLRBEc59FlRF_Rc7Vw,Mangos Refresqueria Y Cafe,4990 S Campbell Ave,Tucson,3.5,52,"Restaurants, Juice Bars & Smoothies, Cafes, Fo...",4.193605
