In [36]:
import json
import pandas as pd
from yelp_recommendation_system import YelpRecommenderSystem
from yelp_recommendation_system.evaluation import *
from yelp_recommendation_system.plotting import *
from yelp_recommendation_system.data_utils import *
# from yelp_recommendation_system.data_utils import remove_rows_from_restaurant_if_contains_categories


#### Load businesses and filter to restaurants

In [2]:
businesses = []
with open('../example_usage/data/yelp_illinois_business_filtered.json', 'r', encoding='utf-8') as f:
    for line in f:
        business = json.loads(line.strip())
        businesses.append(business)
print("Number of Business: ", len(businesses))

Number of Business:  2145


In [3]:
restaurants = []
for b in businesses:
    if  b['categories'] is not None and ('food' in b['categories'].lower() or 'restaurant' in b['categories'].lower()):
        restaurants.append(b)
print("Number of Restaurants: ", len(restaurants))

Number of Restaurants:  1164


#### Load users and reviews

In [4]:
users = []
with open('../example_usage/data/yelp_illinois_restaurant_users_filtered.json', 'r', encoding='utf-8') as f:
    for line in f:
        user = json.loads(line.strip())
        users.append(user)
print("Number of Users: ", len(users))

Number of Users:  2157


In [5]:
reviews = []
with open('../example_usage/data/yelp_illinois_restaurant_reviews_filtered.json', 'r', encoding='utf-8') as f:
    for line in f:
        review = json.loads(line.strip())
        reviews.append(review)

#### Filter reviews to only include those for the restaurants

In [6]:
rest_ids = set([r['business_id'] for r in restaurants])
restaurant_reviews = [r for r in reviews if r['business_id'] in rest_ids]
print("Number of Restaurant Reviews: ", len(restaurant_reviews))

Number of Restaurant Reviews:  7611


## Classifying users location

In [7]:
clsfd_reviews = classify_all_reviews(restaurant_reviews, restaurants)

In [8]:
users = add_base_city_to_users(users, clsfd_reviews)
restaurant_reviews = add_classification_to_reviews(restaurant_reviews, clsfd_reviews)

### VADER Sentiment Score Calculations

In [9]:
# Execute this cell if vaderSentiment is not installed
# !pip install vaderSentiment

In [10]:
# If not previously imported, import the function to analyze reviews with VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [11]:
### Updated function to analyze reviews with VADER in a subset of data, originl function is designed to handle much larger files
def analyze_reviews_with_vader(reviews, output_path=None, text_field='text'):
    """
    Analyze reviews with VADER sentiment and preserve review_id.

    Args:
        reviews: list of review dicts
        output_path: Path to save CSV with sentiment scores
        text_field: Name of field containing review text
    """
    analyzer = SentimentIntensityAnalyzer()

    result = pd.DataFrame(reviews)

    result = _add_vader_scores(result, analyzer, text_field)
    if output_path:
        result.to_csv(output_path, index=False)
    print(f"Saved {len(result)} reviews with sentiment scores to {output_path}")
    return result

def _add_vader_scores(df, analyzer, text_field):
    """Add VADER sentiment scores to dataframe."""
    scores = df[text_field].apply(lambda x: analyzer.polarity_scores(str(x)) if pd.notna(x) else {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0})

    df['vader_neg'] = scores.apply(lambda x: x['neg'])
    df['vader_neu'] = scores.apply(lambda x: x['neu'])
    df['vader_pos'] = scores.apply(lambda x: x['pos'])
    df['vader_compound'] = scores.apply(lambda x: x['compound'])

    return df


#### Apply VADER to restaurant reviews

In [12]:
reviews = analyze_reviews_with_vader(restaurant_reviews, output_path='../example_save_data/yelp_illinois_restaurant_reviews_with_vader.csv', text_field='text')


Saved 7611 reviews with sentiment scores to ../example_usage/data/yelp_illinois_restaurant_reviews_with_vader.csv


# Feature Engineering/Extraction

In [13]:
filtered_restaurants = remove_rows_from_restaurant_if_contains_categories(restaurants)


Number of restaurants before category filtering:  1164
Number of restaurants after category filtering:  1088
Total number removed:  76


## Add attributes to restaurants

In [14]:
filtered_restaurants = add_attributes_to_restaurants(filtered_restaurants, attribute_keys=['DogsAllowed', 'Music' 'OutdoorSeating', 'RestaurantsPriceRange2', 'BusinessParking' 'Ambience', 'Open24Hours'])

In [15]:
rdf = pd.DataFrame(filtered_restaurants)
bids = set(rdf['business_id'].tolist())
reviews = reviews[reviews['business_id'].isin(bids)]

# Adding new business features

In [16]:
rdf = get_business_sentiment_scores(reviews, rdf)
rdf = impute_values_for_cols(rdf)

In [17]:
# LPR
rdf = get_local_pref_ratio(reviews, rdf, users)

# Create users dataframe and add features to restaurants and users

In [20]:
rdf.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'has_american',
       'has_european_food', 'RestaurantsPriceRange2', 'has_seafood',
       'has_asian_food', 'DogsAllowed', 'serves_alcohol', 'has_coffee_or_tea',
       'has_vegetarian', 'has_hispanic_food', 'serves_sweets',
       'has_entertainment', 'Open24Hours', 'avg_sentiment',
       'local_preference_ratio'],
      dtype='object')

In [18]:
udf = get_users_df_with_agged_sentiment(reviews, users)

In [21]:
rdf, udf, reviews = get_elite_users_features(reviews, udf, rdf)

Retrieved elite reviews...
Calculated elite user ratios...


In [22]:
udf = get_ratios(rdf, reviews, udf)

In [23]:
udf = get_user_rating_variance(reviews, udf)

In [24]:
udf = get_user_locality_pref_ratio(reviews, udf)

In [25]:
udf, reviews = get_date_features(reviews, udf)

In [26]:
udf = get_price_pref(rdf, reviews, udf)

In [27]:
udf = get_geo_diversity(reviews, udf)

#### Save to a local path and keep track of the path to load into the rec sys


In [28]:
udf_path = '../example_save_data/yelp_illinois_restaurant_users_features.csv'
rdf_path = '../example_save_data/yelp_illinois_restaurant_features.csv'
rev_path = '../example_save_data/yelp_illinois_restaurant_reviews_with_vader.csv'
udf.to_csv(udf_path, index=False)
rdf.to_csv(rdf_path, index=False)

# Load rec sys object and train the model

In [30]:
# Initialize the system
system = YelpRecommenderSystem()

# Load your data
system.load_data(
    user_path=udf_path,
    restaurant_path=rdf_path,
    reviews_path=rev_path)

# Train the model
history = system.train(
    n_factors=100,      # Number of latent factors
    n_epochs=10,        # Training iterations
    learning_rate=0.005,
    alpha=0.001,        # User features weight
    beta=0.001          # Business features weight
)

Loading data...
Data loaded successfully.
Elapsed time [Getting data]: 0m 0.11s

STEP 1: Filtering Data
Filtering data...
Data filtered successfully.

After filtering (>=20 user reviews, >=50 business reviews):
Users: 4
Businesses: 34
Reviews: 88
Sparsity: 0.3529
Elapsed time [Filtering review data]: 0m 0.00s

STEP 2: Preprocessing Business Features

Filtered businesses: 34
Any NaNs in business_features: False
Elapsed time [Preprocessing business features]: 0m 0.01s

STEP 3: Preprocessing User Features
User features shape: (4, 41)
Any NaNs in user_features_prepared: False
Columns: ['user_id', 'review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos', 'avg_sentiment', 'top_category_ratio', 'has_american_ratio', 'has_asian_food_ratio', 'has_hispanic_food_ratio', 'ha

                                                

Elapsed time [fit recommender]: 0m 0.06s

STEP 6: Evaluating Model

Model Evaluation Results

Prediction Error:
  RMSE: 1.0942
  MAE:  0.8344

Ranking Metrics:
  Precision@ 5: 0.7000  |  Recall@ 5: 0.6667
  Precision@10: 0.7000  |  Recall@10: 0.6667
  Precision@20: 0.7000  |  Recall@20: 0.6667
  Precision@30: 0.7000  |  Recall@30: 0.6667
  Precision@50: 0.7000  |  Recall@50: 0.6667





# Save the model and evaluate

In [31]:
# Save the trained model
system.save_model('../example_usage/example_recommender.pkl')

Model saved to ../example_usage/example_recommender.pkl


In [32]:
# evaluate testset
preds = system.model.test(system.testset)

## Get metrics after evaluation

In [34]:
metrics = evaluate_model(preds)
print_evaluation_results(metrics)


Model Evaluation Results

Prediction Error:
  RMSE: 1.0942
  MAE:  0.8344

Ranking Metrics:
  Precision@ 5: 0.7000  |  Recall@ 5: 0.6667
  Precision@10: 0.7000  |  Recall@10: 0.6667
  Precision@20: 0.7000  |  Recall@20: 0.6667
  Precision@30: 0.7000  |  Recall@30: 0.6667
  Precision@50: 0.7000  |  Recall@50: 0.6667



# Get recommendations for a few users

In [35]:
# check available users in the model
trainset = system.model.trainset
all_users = [trainset.to_raw_uid(i) for i in range(trainset.n_users)]
print(f"Total users in model: {len(all_users)}")

# Get recommendations for multiple users
for user_id in all_users[:5]:
    recs = system.get_recommendations(
        user_id=user_id,
        n_recommendations=5,
        min_rating=3.5
    )
    print(f"\nTop 5 recommendations for {user_id}:")
    print(recs[['name', 'city', 'predicted_rating', 'stars']])

Total users in model: 4

Top 5 recommendations for VxQdmAO6lghp7_ZG0hpojA:
                        name          city  predicted_rating  stars
0  Zapp Thai Noodle & Market      O'Fallon          3.598158    4.0
1            Wang Gang Asian  Edwardsville          3.572894    3.5
2   Casa Maria Mexican Grill    Belleville          3.567566    4.5

Top 5 recommendations for hKBQ-PFlcB-t5FK3HUxoyQ:
                     name          city  predicted_rating  stars
0          Oriental Spoon  Edwardsville          3.986922    4.5
1  Bobby's Frozen Custard     Maryville          3.978873    4.5
2            Bella Milano  Edwardsville          3.970337    4.0
3         Texas Roadhouse        Shiloh          3.958892    3.5
4         Wang Gang Asian  Edwardsville          3.872394    3.5

Top 5 recommendations for IcowqfUjDSmdwtT0vp4pRg:
                     name          city  predicted_rating  stars
0          Oriental Spoon  Edwardsville          3.766825    4.5
1            Bella Milano  Edwa

### Load Saved Model and Get Recommendations

In [40]:
model = YelpRecommenderSystem()
model.load_model('../example_usage/example_recommender.pkl')
mloaded_recs = model.get_recommendations(
    user_id=all_users[1],
    n_recommendations=5,
    min_rating=3.5)

mloaded_recs

Model loaded from ../example_usage/example_recommender.pkl
Loaded model bundle with all supporting data


Unnamed: 0,user_id,business_id,predicted_rating,name,city,state,stars,categories
0,hKBQ-PFlcB-t5FK3HUxoyQ,EvBeuDww_OCNQZ-dRy6qzA,3.986922,Oriental Spoon,Edwardsville,IL,4.5,"Restaurants, Korean"
1,hKBQ-PFlcB-t5FK3HUxoyQ,PUDDBW0MfpSFI2XEMv_g3Q,3.978873,Bobby's Frozen Custard,Maryville,IL,4.5,"Food, Restaurants, Bakeries, Ice Cream & Froze..."
2,hKBQ-PFlcB-t5FK3HUxoyQ,iGdpNaFSHjjwEuUvMykm1w,3.970337,Bella Milano,Edwardsville,IL,4.0,"Caterers, Restaurants, Italian, Seafood, Event..."
3,hKBQ-PFlcB-t5FK3HUxoyQ,pO1bjUULFCGIo9O2S28yoA,3.958892,Texas Roadhouse,Shiloh,IL,3.5,"American (Traditional), Barbeque, Restaurants,..."
4,hKBQ-PFlcB-t5FK3HUxoyQ,gXg277YblY5xsaeMzJ73TA,3.872394,Wang Gang Asian,Edwardsville,IL,3.5,"Asian Fusion, Restaurants"
