In [1]:
import pandas as pd
from data_preprocessing import load_and_preprocess_data
from models.two_tower_model import build_two_tower_model
from models.baselines import matrix_factorization, item_based_cf
from evaluation import evaluate_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from itertools import product
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!pip install tqdm
from tqdm.notebook import tqdm




[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
def load_large_json(file_path, chunksize=10000):
    """Load a line-delimited JSON file into one DataFrame, parsing it in chunks.

    Args:
        file_path: Path of the JSON Lines file (one JSON object per line).
        chunksize: Number of lines parsed per chunk.

    Returns:
        A DataFrame containing every row of the file with a fresh 0..n-1
        index. An empty DataFrame is returned when the file yields no chunks.

    Note:
        All chunks are kept and concatenated, so the complete dataset still
        has to fit in memory; chunking only limits how much is parsed at once.
    """
    print(f"Loading {file_path} in chunks...")
    # Use the reader as a context manager so the underlying file handle is
    # closed even if parsing one of the chunks raises.
    with pd.read_json(file_path, lines=True, chunksize=chunksize) as reader:
        chunks = list(reader)
    if not chunks:
        # pd.concat raises ValueError on an empty list; return an empty frame.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

In [6]:
# Read every raw JSON Lines export into its own DataFrame. The files are
# loaded in the order listed here: business, user, tip, checkin, review.
raw_names = ('business', 'user', 'tip', 'checkin', 'review')
(business_data, user_data, tip_data,
 checkin_data, review_data) = [
    load_large_json(f'../data/{name}.json') for name in raw_names
]

Loading ../data/business.json in chunks...
Loading ../data/user.json in chunks...
Loading ../data/tip.json in chunks...
Loading ../data/checkin.json in chunks...
Loading ../data/review.json in chunks...


In [7]:
# Run the project's preprocessing over the five raw frames. The call returns
# four values, unpacked below; `max_categories=50` is forwarded unchanged.
preprocessing_outputs = load_and_preprocess_data(
    business_data,
    user_data,
    review_data,
    tip_data,
    checkin_data,
    max_categories=50,
)
train_data, restaurant_features, user_df, category_names = preprocessing_outputs

In [8]:
import os
import pickle
def save_preprocessed_data(data, file_path):
    """
    Save preprocessed data to disk using pickle.

    The parent directory of ``file_path`` is created when it does not exist
    yet, so saving to a fresh location does not fail with FileNotFoundError.

    Args:
        data: The preprocessed data to save (any picklable object).
        file_path: The file path where the data should be saved.

    Returns:
        None. A confirmation message is printed after the file is written.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # A bare file name has no directory part; only create real parents.
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)
    print(f"Data saved to {file_path}")
    
def load_preprocessed_data(file_path):
    """
    Read a previously pickled object back from disk, if the file is present.

    Args:
        file_path: Location of the pickle file to read.

    Returns:
        The unpickled object, or None when nothing exists at ``file_path``.
    """
    # Guard clause: nothing cached yet, so report it and hand back None.
    if not os.path.exists(file_path):
        print(f"No preprocessed data found at {file_path}")
        return None

    # Unpickling executes code embedded in the file; only load trusted caches.
    with open(file_path, 'rb') as handle:
        cached = pickle.load(handle)
    print(f"Data loaded from {file_path}")
    return cached

In [None]:
preprocessed_file_path = "../data/preprocessed_data.pkl"

# Bundle the preprocessing outputs together with the raw frames they were
# derived from, then persist the whole bundle as one pickle file.
preprocessed_data = dict(
    train_data=train_data,
    restaurant_features=restaurant_features,
    user_df=user_df,
    category_names=category_names,
    business_data=business_data,
    user_data=user_data,
    tip_data=tip_data,
    checkin_data=checkin_data,
    review_data=review_data,
)
save_preprocessed_data(preprocessed_data, preprocessed_file_path)

Data saved to ../data/preprocessed_data.pkl


In [10]:
# Restore the cached preprocessing outputs from disk.
preprocessed_data = load_preprocessed_data(preprocessed_file_path)
if preprocessed_data is None:
    # load_preprocessed_data returns None when the file is missing; fail with
    # a clear message instead of a TypeError on the subscripts below.
    raise FileNotFoundError(
        f"Preprocessed cache not found at {preprocessed_file_path}; "
        "run the preprocessing and save cells first."
    )
train_data = preprocessed_data["train_data"]
restaurant_features = preprocessed_data["restaurant_features"]
user_df = preprocessed_data["user_df"]
category_names = preprocessed_data["category_names"]

Data loaded from ../data/preprocessed_data.pkl


In [13]:
import tensorflow as tf

# List the GPUs TensorFlow can see; an empty list means none were detected.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

[]


In [14]:
# Two-stage split with a fixed seed: hold out 20% of all rows as the test set,
# then hold out 20% of the remainder as the validation set.
train_val, test = train_test_split(train_data, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.2, random_state=42)

In [16]:
# Write each split to ../data/<name>.csv without the index column
# (same order as before: train, test, val).
for split_name, split_frame in (('train', train), ('test', test), ('val', val)):
    split_frame.to_csv(f'../data/{split_name}.csv', index=False)

In [15]:
# Preview the training split as this cell's rich output.
train

Unnamed: 0,user_id,business_id,stars,review_count_norm_x,average_stars_norm,elite_binary,fans_norm,friends_count_norm,stars_norm,review_count_norm_y,...,cat_45,cat_46,cat_47,cat_48,cat_49,park_garage,park_street,park_validated,park_lot,park_valet
4670133,DkME5tzZe25XUUhaudGtdw,l-rGtJt0E7PAklT0IK7oFQ,1,0.031557,-1.546892,0,-0.025688,-0.013186,1.187133,2.226831,...,0,0,0,0,1,False,False,False,False,True
3961966,nL4VRVgSpOiV4xrGGHdUpA,uxZENO1iy7tz8MK77L3_GQ,4,-0.113779,0.253103,0,-0.025688,-0.361066,0.584422,0.972451,...,0,1,0,1,1,False,True,False,False,False
2324996,EviLk1SQozBvNMKOQvhJwg,So20ueL9qAIpV6FIwUtlKA,5,-0.101668,0.785496,0,-0.080843,-0.320139,1.789844,-0.329564,...,0,0,0,1,1,False,False,False,True,False
4292971,DE-idFvMJ_6cjChcTFa3JQ,oHvEgLH6pAkcrPmeR1l3UQ,5,-0.150113,0.616482,0,-0.080843,-0.354245,1.187133,3.084256,...,1,0,1,0,1,False,True,False,False,False
4103590,1urqOIfpbmKhga_JHND8Sw,Cmw00BFD1l-_DJHPuKi2Rw,5,-0.247004,1.157326,0,-0.080843,0.246019,0.584422,2.290344,...,1,0,1,0,1,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826792,tBo5BDh-uybm7ZZ9PUfBUA,ws2gnWU77nydKJB81GXT9g,1,-0.259116,0.033385,0,-0.080843,4.400116,0.584422,1.713435,...,0,0,0,0,1,False,False,False,True,False
947791,Bw3zquF7auR_yBx60HWVTQ,GXjN3c0swlIfH5kbVxATFQ,5,-0.247004,1.157326,0,-0.080843,-0.279212,1.187133,0.670765,...,0,0,0,0,1,True,True,False,True,False
1472514,LqNqJ_uW_2w1wIBmTsuaFg,TMBbCDQG6bENJoQSST0XiA,5,-0.138002,0.244652,0,-0.025688,0.662111,1.187133,1.311187,...,0,0,0,0,1,False,True,False,True,False
4454082,xZjKpD0wtlI0uUP0srdu7w,ZNOc9LErlBDtJF3Y7LQtwA,5,-0.162225,-1.039851,0,-0.080843,-0.361066,0.584422,0.766034,...,0,1,0,1,1,False,True,False,True,False


In [25]:
def create_dataset(data, num_categories=50):
    """Split a preprocessed interaction frame into model inputs and labels.

    Args:
        data: DataFrame holding, per row, the normalised user columns
            (``review_count_norm_x``, ``average_stars_norm``, ``fans_norm``,
            ``friends_count_norm``, ``elite_binary``), the restaurant columns
            (``stars_norm``, ``review_count_norm_y``, ``lat_norm``,
            ``lon_norm``, ``cat_0`` .. ``cat_{num_categories - 1}`` and the
            five ``park_*`` flags) and the target column ``stars``.
        num_categories: Number of ``cat_<i>`` indicator columns to read. The
            default of 50 keeps the previous hard-coded behaviour; pass the
            actual number of category columns if preprocessing produced a
            different count.

    Returns:
        A tuple ``(user_features, rest_features, labels)``:
            * ``user_features``: dict of five 1-D float32 arrays.
            * ``rest_features``: dict of four 1-D float32 arrays plus the 2-D
              float32 arrays ``categories`` (rows x num_categories) and
              ``parking`` (rows x 5).
            * ``labels``: 1-D float32 array taken from ``stars``.

    Raises:
        KeyError: If any of the expected columns is missing from ``data``.
    """
    category_cols = [f'cat_{i}' for i in range(num_categories)]
    parking_cols = ['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']

    # User features
    # NOTE(review): the `_x` / `_y` suffixes on review_count presumably come
    # from merging user and restaurant tables; confirm in the preprocessing.
    user_features = {
        'review_count': data['review_count_norm_x'].values.astype(np.float32),
        'average_stars': data['average_stars_norm'].values.astype(np.float32),
        'fans': data['fans_norm'].values.astype(np.float32),
        'friends_count': data['friends_count_norm'].values.astype(np.float32),
        'elite': data['elite_binary'].values.astype(np.float32)
    }

    # Restaurant features
    rest_features = {
        'stars': data['stars_norm'].values.astype(np.float32),
        'review_count': data['review_count_norm_y'].values.astype(np.float32),
        'lat': data['lat_norm'].values.astype(np.float32),
        'lon': data['lon_norm'].values.astype(np.float32),
        'categories': data[category_cols].values.astype(np.float32),
        # Boolean parking flags become 0.0 / 1.0 after the float32 cast.
        'parking': data[parking_cols].values.astype(np.float32)
    }

    # Labels (target variable): the star rating given in the interaction.
    labels = data['stars'].values.astype(np.float32)

    return user_features, rest_features, labels

In [26]:
# Convert each split (train, then val, then test) into its
# (user features, restaurant features, labels) triple.
(
    (train_user, train_rest, train_labels),
    (val_user, val_rest, val_labels),
    (test_user, test_rest, test_labels),
) = [create_dataset(split) for split in (train, val, test)]

In [19]:
# Hyperparameter search space. Key order matters: the search loop unpacks the
# value combinations positionally as (embedding_dim, learning_rate, batch_size).
param_grid = dict(
    embedding_dim=[32, 64],
    learning_rate=[0.001, 0.0001],
    batch_size=[32, 64],
)

# Trackers for the best configuration found so far (lower RMSE is better).
best_rmse, best_params, best_model = float('inf'), {}, None

In [27]:
# Debug inspection: print the restaurant-feature dict built by create_dataset
# for the training split. The two commented-out lines are the equivalent
# prints for the user features and the labels.
# print(train_user)
print(train_rest)
# print(train_labels)

{'stars': array([ 1.1871328 ,  0.58442223,  1.7898436 , ...,  1.1871328 ,
        0.58442223, -1.2237097 ], dtype=float32), 'review_count': array([ 2.2268314 ,  0.97245103, -0.32956406, ...,  1.3111867 ,
        0.766034  ,  1.7504845 ], dtype=float32), 'lat': array([-0.42923838,  0.48785722, -0.8383215 , ..., -1.1731241 ,
       -0.78424597,  0.49068806], dtype=float32), 'lon': array([-2.3024716 ,  0.9180817 , -1.6731824 , ..., -0.16596775,
       -1.6750513 ,  0.91780776], dtype=float32), 'categories': array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 0., 1.]], dtype=float32), 'parking': array([[0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)}


In [None]:
# Exhaustive grid search over `param_grid`: train one two-tower model per
# combination and keep the one with the lowest validation RMSE.

# Reset the trackers here so that re-running this cell starts a fresh search
# instead of comparing against results left over from an earlier run.
best_rmse = float('inf')
best_params = {}
best_model = None

for values in product(*param_grid.values()):
    # Pair each value with its key so the unpacking below does not depend on
    # the order in which the keys of `param_grid` were declared.
    params = dict(zip(param_grid, values))
    emb_dim = params['embedding_dim']
    lr = params['learning_rate']
    batch = params['batch_size']
    print(f"Testing params: emb_dim={emb_dim}, lr={lr}, batch={batch}")

    # Start every configuration from a clean Keras state and the same seed so
    # runs are reproducible and differences come from the hyperparameters.
    tf.keras.backend.clear_session()
    tf.random.set_seed(42)

    # NOTE(review): create_dataset supplies five user features (including
    # 'elite') while user_feature_dim is 4 -- confirm this matches what
    # build_two_tower_model expects.
    model = build_two_tower_model(
        user_feature_dim=4,
        rest_feature_dim=4+5,
        category_dim=len(category_names),
        embedding_dim=emb_dim
    )
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                    loss='mse',
                    metrics=['mae'])

    model.fit(
        [train_user, train_rest], train_labels,
        validation_data=([val_user, val_rest], val_labels),
        epochs=10,
        batch_size=batch,
        verbose=0
    )
    print(f"Training completed for params: emb_dim={emb_dim}, lr={lr}, batch={batch}")

    # Flatten the predictions so they line up 1:1 with the 1-D label vector,
    # as is done for the test-set evaluation.
    val_pred = model.predict([val_user, val_rest], batch_size=batch, verbose=0)
    val_rmse = np.sqrt(mean_squared_error(val_labels, val_pred.ravel()))
    print(f"Validation RMSE: {val_rmse:.4f}")

    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_params = dict(params)
        best_model = model

Testing params: emb_dim=32, lr=0.001, batch=32
Training completed for params: emb_dim=32, lr=0.001, batch=32


Exception ignored in: <function tqdm.__del__ at 0x000001D88CCF0F40>
Traceback (most recent call last):
  File "e:\Masters\CMPE256-project\Restaurant_Recommendation\recom_env\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "e:\Masters\CMPE256-project\Restaurant_Recommendation\recom_env\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'



Testing params: emb_dim=32, lr=0.001, batch=64


In [None]:
# `best_model` stays None until the hyperparameter search has finished at
# least one configuration; fail early with a clear message in that case.
if best_model is None:
    raise RuntimeError(
        "No trained model available: run the hyperparameter search cell "
        "to completion before evaluating."
    )

print(f"Best params: {best_params}, Validation RMSE: {best_rmse:.4f}")
print("Evaluating best model on test set...")
# Evaluate best model on the held-out test split; predictions are flattened
# to a 1-D vector before being compared with the labels.
test_pred = best_model.predict([test_user, test_rest], batch_size=best_params['batch_size'], verbose=0)
evaluate_model(test_labels, test_pred.flatten(), "Two-Tower Model")
print("-------------------------------------------------------------------")
print("Evaluating baseline models...")
print("Evaluating Matrix Factorization...")

In [None]:
# (user_id, business_id) pairs of the test split, in row order. Zipping the
# two columns avoids building a full Series per row as iterrows() does.
test_pairs = list(zip(test['user_id'], test['business_id']))

# Baseline: Matrix Factorization.
# Fit on the training split only: `train_data` is the full pre-split frame and
# contains the test rows, so fitting on it would leak the test ratings into
# the baseline and make its score look better than it is.
predict_mf, user_map, item_map = matrix_factorization(train)
mf_pred = [
    predict_mf(user_id, business_id, user_map, item_map)
    for user_id, business_id in test_pairs
]
evaluate_model(test_labels, mf_pred, "Matrix Factorization")

print("Evaluating Item-Based CF...")
# Baseline: Item-Based CF
predict_icf, item_similarities = item_based_cf(train)
icf_pred = [
    predict_icf(user_id, business_id, train, item_similarities)
    for user_id, business_id in test_pairs
]
evaluate_model(test_labels, icf_pred, "Item-Based CF")

print("-------------------------------------------------------------------")
print("Training completed for all models.")
# Save model first, then report it, so the message is only printed when the
# file has actually been written.
best_model.save('two_tower_model.h5')
print("Model saved as 'two_tower_model.h5'")