In [1]:
import json
import pandas as pd
from collections import defaultdict, Counter
from typing import List, Tuple, Dict, Optional

#### This was used to infer user states based on their base_city using restaurant data

In [6]:
def infer_user_states(users: List[dict], restaurants: Optional[List[dict]] = None,
                    restaurants_filepath: str = '../yelp_dataset/yelp_restaurants.json',
                    overwrite_state_code: bool = False) -> Tuple[List[dict], Dict[str,str]]:
    """
    For each user with a `base_city`, set `user['inferred_state']` and populate
    `user['state_code']` when missing (unless overwrite_state_code is True).
    Returns (updated_users, city_state_map).
    """
    if restaurants is None:
        with open(restaurants_filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
            restaurants = data.get('restaurants', data)  # support either shape

    city_state_map = build_city_state_map(restaurants)

    for u in users:
        base_city = u.get('base_city')
        if not base_city:
            continue
        key = base_city.strip().lower()
        state = city_state_map.get(key)
        if state:
            u['inferred_state'] = state
            if overwrite_state_code or not u.get('state_code'):
                u['state_code'] = state

    return users, city_state_map



In [7]:
# restaurants = get_restaurants()  # existing helper or let infer_user_states load from `../yelp_dataset/yelp_restaurants.json`
# users_updated, mapping = infer_user_states(users, restaurants)

In [8]:
# save updated users back to a file if needed
def save_users(users: List[dict], filepath: str = '../yelp_dataset/yelp_users_with_base_city.json'):
    with open(filepath, 'w', encoding='utf-8') as f:
        for user in users:
            f.write(json.dumps(user) + '\n')
# save_users(users_updated)

### Code to add num checkins and num tips to restaurant data


In [9]:
def add_num_tips_to_restaurants(
    restaurants: List[dict],
    tips_filepath: str = '../yelp_dataset/yelp_academic_dataset_tip.json') -> List[dict]:
    """
    Adds 'num_tips' field to each restaurant dict based on tips data.
    """
    tip_counts = Counter()
    with open(tips_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tip = json.loads(line.strip())
            business_id = tip.get('business_id')
            if business_id:
                tip_counts[business_id] += 1

    for r in restaurants:
        business_id = r.get('business_id')
        r['num_tips'] = tip_counts.get(business_id, 0)

    return restaurants

In [10]:
# restaurants_updated = add_num_tips_to_restaurants(restaurants)

#### Code to add num checkins to restaurant data

In [11]:
def add_num_checkins_to_restaurants(
    restaurants: List[dict],
    checkins_filepath: str = '../yelp_dataset/yelp_academic_dataset_checkin.json') -> List[dict]:
    """
    Adds 'num_checkins' field to each restaurant dict based on checkins data.
    """
    checkin_counts = {}
    with open(checkins_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            checkin = json.loads(line.strip())
            business_id = checkin.get('business_id')
            num_checkins = len(checkin["date"].split(","))
            if business_id:
                checkin_counts[business_id] = num_checkins

    for r in restaurants:
        business_id = r.get('business_id')
        r['num_checkins'] = checkin_counts.get(business_id, 0)

    return restaurants

In [12]:
# restaurants_updated = add_num_checkins_to_restaurants(restaurants)

In [19]:
len(restaurants_updated)

64629

In [13]:
def save_restaurants(restaurants: List[dict], filepath: str = '../yelp_dataset/yelp_restaurants.json'):
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump({'restaurants': restaurants}, f, indent=2)

In [14]:
# save_restaurants(restaurants_updated)

In [None]:
# Add more features to business and user json
# Aggregate sentiment scores at business and user level
# Extract categories from businesses and create user categegory preferences
# Calculate local_preference_ratio for businesses with reviews file
# Add most frequent review day_of_week and hour_of_day to users based on reviews file


#### Basically this helps build the mapping, the only reason for this is because there are some weird data points where the location data is weird

In [2]:
def build_city_state_map(restaurants: List[dict]) -> Dict[str, str]:
    """
    Return a mapping normalized_city -> most_common_state_code
    """
    counts = defaultdict(Counter)
    for r in restaurants:
        city = r.get('city')
        state = r.get('state') or r.get('state_code')
        if not city or not state:
            continue
        key = city.strip().lower()
        counts[key][state.strip()] += 1
    return {city: counter.most_common(1)[0][0] for city, counter in counts.items()}



In [34]:
def get_users(filepath: str = '../yelp_dataset/yelp_users_with_base_city.json') -> list:
    users = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            user = json.loads(line.strip())
            users.append(user)
    return users
users = get_users()

In [2]:
def get_restaurants(filepath: str = '../yelp_dataset/yelp_restaurants.json') -> list:
    with open(filepath, 'r', encoding='utf-8') as f:
        restaurants = json.load(f)
    restaurants = restaurants['restaurants']
    return restaurants

In [3]:
restaurants = get_restaurants()

In [4]:
restaurants[0]

{'business_id': 'MTSW4McQd7CbVtyjqoe9mw',
 'name': 'St Honore Pastries',
 'address': '935 Race St',
 'city': 'Philadelphia',
 'state': 'PA',
 'postal_code': '19107',
 'latitude': 39.9555052,
 'longitude': -75.1555641,
 'stars': 4.0,
 'review_count': 80,
 'is_open': 1,
 'attributes': {'RestaurantsDelivery': 'False',
  'OutdoorSeating': 'False',
  'BusinessAcceptsCreditCards': 'False',
  'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
  'BikeParking': 'True',
  'RestaurantsPriceRange2': '1',
  'RestaurantsTakeOut': 'True',
  'ByAppointmentOnly': 'False',
  'WiFi': "u'free'",
  'Alcohol': "u'none'",
  'Caters': 'True'},
 'categories': 'Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries',
 'hours': {'Monday': '7:0-20:0',
  'Tuesday': '7:0-20:0',
  'Wednesday': '7:0-20:0',
  'Thursday': '7:0-20:0',
  'Friday': '7:0-21:0',
  'Saturday': '7:0-21:0',
  'Sunday': '7:0-21:0'},
 'num_tips': 10,
 'num_checkins': 335}

#### Get category counts across all restaurants

In [12]:
# split each category for each business and get counts for categories
counts_dict = Counter()
for r in restaurants:
    cats = r.get('categories', [])
    cats = cats.split(', ') if isinstance(cats, str) else cats
    for cat in cats:
        counts_dict[cat.strip().lower()] += 1
# print top 30 most common categories
to = counts_dict.most_common(300)

In [13]:
to

[('restaurants', 52268),
 ('food', 27781),
 ('nightlife', 9326),
 ('bars', 8872),
 ('sandwiches', 8366),
 ('american (traditional)', 8139),
 ('pizza', 7093),
 ('coffee & tea', 6703),
 ('fast food', 6472),
 ('breakfast & brunch', 6239),
 ('american (new)', 6097),
 ('burgers', 5636),
 ('mexican', 4600),
 ('italian', 4573),
 ('specialty food', 4233),
 ('event planning & services', 3607),
 ('seafood', 3539),
 ('shopping', 3367),
 ('desserts', 3186),
 ('chinese', 3169),
 ('bakeries', 3150),
 ('grocery', 3139),
 ('salad', 3064),
 ('chicken wings', 2966),
 ('cafes', 2756),
 ('ice cream & frozen yogurt', 2657),
 ('beer', 2413),
 ('wine & spirits', 2413),
 ('delis', 2393),
 ('caterers', 2336),
 ('convenience stores', 1919),
 ('japanese', 1830),
 ('sports bars', 1818),
 ('sushi bars', 1717),
 ('barbeque', 1694),
 ('juice bars & smoothies', 1684),
 ('arts & entertainment', 1563),
 ('asian fusion', 1547),
 ('steakhouses', 1506),
 ('pubs', 1503),
 ('diners', 1494),
 ('cocktail bars', 1476),
 ('drug

#### This is used to create a variety features downstream. Some are business features and some are user features. Take a look at the mapping keys, and the associated lists, these are the categories that will be used to create boolean features for businesses and ratio features for users

In [64]:
categories_to_remove = [
    'automotive', 'gas stations', 'fashion', 'hotels & travel', 'cosmetics & beauty supply', 'home & garden',
    'photography stores & services', 'electronics', 'furniture stores', 'tobacco shops', 'education',
    'home services', 'mobile phones', 'home decor', 'bookstores', 'appliances', 'hardware stores', 'vape shops',
    'head shops', 'religious organizations' 'party supplies', 'public services & government', 'medical centers',
    'community service/non-profit', 'tires', 'sporting goods', 'eyewear & opticians', 'hobby shops', 'nurseries & gardening', 'fitness & instruction', 'real estate', 'financial services', 'lawyers', 'insurance', 'shipping centers', 'professional services', 'shopping centers', 'food court', 'wholesale stores', 'dance clubs', 'hookah bars', 'massage' 'local services', 'international grocery', 'farmers market', 'department stores', 'beauty & spas', 'health & medical', 'convenience stores',]


def create_common_mappings():
    category_mappings = {
        'has_american': ['american (traditional)', 'american (new)', 'diners', 'steakhouses', 'hot dogs', 'comfort food', 'southern', 'bbq', 'new american', 'breakfast & brunch', 'buffets', 'chicken wings', 'cajun/creole', 'gastropubs', 'meat shops', 'food stands', 'food trucks'],

        'has_asian_food': ['japanese', 'chinese', 'sushi bars', 'asian fusion', 'korean', 'indian', 'vietnamese', 'thai', 'laotian', 'malaysian',
                           'mongolian', 'taiwanese', 'cantonese', 'dim sum', 'hot pot', 'filipino', 'singaporean'
                           'szechuan', 'ramen'],
        'has_hispanic_food': ['mexican', 'tex-mex', 'spanish', 'puerto rican', 'cuban', 'colombian', 'salvadoran', 'argentine', 'peruvian', 'brazilian', 'venezuelan', 'dominican', 'guatemalan', 'honduran', 'empanadas' 'latin american', 'new mexican cuisine'],

        'has_european_food': ['italian', 'french', 'german', 'greek', 'mediterranean', 'spanish', 'portuguese', 'austrian', 'belgian', 'hungarian', 'irish', 'scandinavian', 'swiss', 'russian', 'polish', 'pasta', 'modern european'],

        'has_seafood': ['seafood', 'seafood markets', 'fish & chips', 'poke'],
        'serves_alcohol': ['beer, wine & spirits', 'pubs', 'cocktail bars', 'wine bars', 'beer bar', 'breweries', 'wineries', 'brewpubs'
                           'beer tours', 'wine tours', 'distilleries', 'whiskey bars', 'wine tasting classes', 'irish pub',
                           ],
        'has_vegetarian': ['vegan', 'vegetarian'],
        'has_entertainment': ['arts & entertainment', 'music venues', 'karaoke', 'dinner theatre', 'piano bars', ],
        'has_coffee_or_tea': ['coffee & tea', 'bubble tea', 'tea rooms', 'coffee roasteries', 'tea rooms'],
        'serves_sweets': ['desserts', 'ice cream & frozen yogurt', 'bakeries', 'donuts', 'cupcakes', 'chocolatiers & shops', 'shaved_ice', 'gelato', 'creperies', 'patisserie/cake shop', 'candy stores', 'custom cakes'],
    }
    return category_mappings





#### Some "restaurants" (businesses) have categories that are not relevant to food or dining, so we will remove those restaurants from the dataset

In [None]:
def remove_rows_from_restaurant_if_contains_categories(restaurants, categories_to_remove: List[str]) -> List[dict]:
    """
    Returns restaurants with rows removed if they contain any of the categories in categories_to_remove
    """
    print("Number of restaurants before category filtering: ", len(restaurants))
    filtered_restaurants = []
    mappings = create_common_mappings()

    # get cats for each restaurant and check if any are in categories_to_remove
    for r in restaurants:
        cats = r.get('categories', [])
        cats = cats.split(', ') if isinstance(cats, str) else cats
        cats = [cat.strip().lower() for cat in cats]

        # check if any category in cats is in categories_to_remove if no, then we will continue logic abd eventualy append it
        if not any(cat in categories_to_remove for cat in cats):
            for c in cats:
                for key in mappings.keys():
                    if c in mappings[key]:
                        r[key] = True
            filtered_restaurants.append(r)
    print("Number of restaurants after category filtering: ", len(filtered_restaurants))
    print("Total number removed: ", len(restaurants) - len(filtered_restaurants))
    return filtered_restaurants


# cats_to_track = ['has_nightlife', 'serves_alcohol', 'has_sandwiches', 'has_american', 'has_pizza', 'has_coffee_or_tea', 'has_breakfast_and_brunch', 'has_burgers', 'has_mexican', 'has_italian', 'has_chinese', 'has_japanese', 'has_seafood', 'has_vegetarian', 'has_vegan']

In [17]:
# 64629 original len
filtered_restaurants = remove_rows_from_restaurant_if_contains_categories(restaurants, categories_to_remove)

Number of restaurants before category filtering:  64629
Number of restaurants after category filtering:  59248
Total number removed:  5381


#### Checking what attributes are available in the attributes field of the restaurant data

In [19]:
# get features from attributes field
all_attr_keys = set()
for r in filtered_restaurants:
    attrs = r.get('attributes', {})
    if attrs:
        for key in attrs.keys():
            all_attr_keys.add(key)

all_attr_keys

{'AcceptsInsurance',
 'AgesAllowed',
 'Alcohol',
 'Ambience',
 'BYOB',
 'BYOBCorkage',
 'BestNights',
 'BikeParking',
 'BusinessAcceptsBitcoin',
 'BusinessAcceptsCreditCards',
 'BusinessParking',
 'ByAppointmentOnly',
 'Caters',
 'CoatCheck',
 'Corkage',
 'DietaryRestrictions',
 'DogsAllowed',
 'DriveThru',
 'GoodForDancing',
 'GoodForKids',
 'GoodForMeal',
 'HappyHour',
 'HasTV',
 'Music',
 'NoiseLevel',
 'Open24Hours',
 'OutdoorSeating',
 'RestaurantsAttire',
 'RestaurantsCounterService',
 'RestaurantsDelivery',
 'RestaurantsGoodForGroups',
 'RestaurantsPriceRange2',
 'RestaurantsReservations',
 'RestaurantsTableService',
 'RestaurantsTakeOut',
 'Smoking',
 'WheelchairAccessible',
 'WiFi'}

In [None]:
# Maybe track: ['DogsAllowed', 'Music' 'OutdoorSeating', 'RestaurantsPriceRange2', 'BusinessParking' 'Ambience', 'Open24Hours']

In [None]:
# extract these values from attributes  and r

#### Extracts specific attributes from the attribute field, and adds them as top-level fields in the restaurant dict (it will later be used as a feature)

In [20]:
# add attributes as fied to restaurant dict
def add_attributes_to_restaurants(restaurants: List [dict], attribute_keys: List[str]):
    for r in restaurants:
        attrs = r.get('attributes', {})
        if attrs:
            for key in attribute_keys:
                value = attrs.get(key)
                if value is not None:
                    # convert string "True"/"False" to boolean
                    if isinstance(value, str):
                        if value.lower() == 'true' or 'true' in value.lower():
                            r[key] = True
                        elif value.lower() == 'false' or 'false' in value.lower():
                            r[key] = False
                        else:
                            r[key] = value
                    else:
                        r[key] = value
    return restaurants

filtered_restaurants = add_attributes_to_restaurants(filtered_restaurants, attribute_keys=['DogsAllowed', 'Music' 'OutdoorSeating', 'RestaurantsPriceRange2', 'BusinessParking' 'Ambience', 'Open24Hours'])

### Adding sentiment data from reviews to restaurant data

In [21]:
reviews = pd.read_csv('../yelp_dataset/yelp_restaurant_reviews_with_vader.csv')

In [29]:
rdf = pd.DataFrame(filtered_restaurants)
bids = set(rdf['business_id'].tolist())
reviews = reviews[reviews['business_id'].isin(bids)]
# reviews.to_csv('../yelp_dataset/yelp_restaurant_reviews_with_vader.csv', index=False)

#### AGgregate sentiment scores at restaurant level

In [30]:
# agg mean compound sentiment scores at restaurant level
business_sentiment_summary = reviews.groupby('business_id').agg({'vader_compound': 'mean',}).reset_index()
# rename the column to be more descriptive
business_sentiment_summary.columns = ['business_id', 'avg_sentiment']

In [24]:
business_sentiment_summary

Unnamed: 0_level_0,business_id,vader_compound
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
0,---kPU91CF4Lq2-WlRu9Lw,0.851179
1,--0iUa4sNDFiZFrAdIWhZQ,0.544421
2,--7PUidqRWpRSpXebiyxTg,0.113567
3,--8IbOsAAxjKRoYsBFL-PA,0.261070
4,--MbOh2O1pATkXa7xbU6LA,0.844852
...,...,...
59243,zzjFdJwXuxBOGe9JeY_EMw,0.737233
59244,zznJox6-nmXlGYNWgTDwQQ,-0.177350
59245,zznZqH9CiAznbkV6fXyHWA,0.908475
59246,zzu6_r3DxBJuXcjnOYVdTw,0.911538


In [42]:
# len(rdf)
rdf.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'num_tips', 'num_checkins',
       'has_coffee_or_tea', 'serves_sweets', 'RestaurantsPriceRange2',
       'serves_alcohol', 'DogsAllowed', 'has_european_food', 'has_american',
       'has_asian_food', 'has_seafood', 'has_entertainment',
       'has_hispanic_food', 'has_vegetarian', 'Open24Hours', 'avg_sentiment',
       'local_preference_ratio'],
      dtype='object')

In [51]:
rdf.Open24Hours.head(3)

0    NaN
1    NaN
2    NaN
Name: Open24Hours, dtype: object

#### Impute missing boolean attribute values with False

In [52]:
# update cols by using fillna with False for the bool columna
bool_columns = ['has_coffee_or_tea', 'serves_sweets',
       'serves_alcohol', 'DogsAllowed', 'has_european_food', 'has_american',
       'has_asian_food', 'has_seafood', 'has_entertainment',
       'has_hispanic_food', 'has_vegetarian', 'Open24Hours']

for col in bool_columns:
    if col in rdf.columns:
        rdf[col] = rdf[col].fillna(False)
    else:
        print(col, " not in rdf columns")


  rdf[col] = rdf[col].fillna(False)


In [53]:
# update na in price range columns, lets take a look at current price range values
rdf.RestaurantsPriceRange2.value_counts(normalize=True)
# this justifies us imputing nans with 2 for price range

RestaurantsPriceRange2
2       0.519626
1       0.437688
3       0.038115
4       0.004251
None    0.000319
Name: proportion, dtype: float64

#### Impute missing price range with median value of 2

In [54]:
rdf['RestaurantsPriceRange2'] = rdf['RestaurantsPriceRange2'].fillna(2)

#### Calculate local preference ratio for restaurants

In [36]:
# use the reviews from review classification file to get local preference ratio
# local preference ratio = num reviews from users in same city as restaurant / total num reviews for restaurant
# create lookup dictionaries once
business_city_map = rdf.set_index('business_id')['city'].to_dict()
user_city_map = {u['user_id']: u.get('city') for u in users}

# add city cols to reviews df
reviews['business_city'] = reviews['business_id'].map(business_city_map)
reviews['user_city'] = reviews['user_id'].map(user_city_map)

# calc local preference ratio
reviews['is_local'] = (reviews['business_city'] == reviews['user_city']).astype(int)

local_pref_ratios = (
    reviews.groupby('business_id')['is_local']
    .agg(['sum', 'count'])
    .assign(local_preference_ratio=lambda x: x['sum'] / x['count'])
    ['local_preference_ratio'].to_dict())

# add to restaurant df
rdf['local_preference_ratio'] = rdf['business_id'].map(local_pref_ratios)

#### Aggregate user sentiment scores and add to user data

In [37]:
# agg mean compound sentiment scores at user level
user_sentiment_summary = reviews.groupby('user_id').agg({'vader_compound': 'mean',}).reset_index()
# Rename the column to be more descriptive
user_sentiment_summary.columns = ['user_id', 'avg_sentiment']
udf = pd.DataFrame(users)

In [38]:
udf = udf.merge(user_sentiment_summary, on='user_id', how='left')

In [31]:
# merge sentiment summary back to restaurant data
# rdf = rdf.merge(business_sentiment_summary, on='business_id', how='left')

#### Gets the number of years theb user was an elite userb and adds elite user ratio to restaurant data

In [None]:
udf['years_elite'] = udf['elite'].apply(lambda x: len(x.split(',')) if isinstance(x, str) and x else 0)

In [71]:
# Create elite user lookup (users who have been elite at least once)
elite_user_ids = set(udf[udf['years_elite'] > 0]['user_id'])

# add is_elite flag to reviews DataFrame
reviews['is_elite'] = reviews['user_id'].isin(elite_user_ids)

print("Retrieved elite reviews...")
# calc elite user ratio using aggregation
elite_user_ratios = (
    reviews.groupby('business_id')['is_elite']
    .agg(['sum', 'count'])
    .assign(elite_user_ratio=lambda x: x['sum'] / x['count'])
    ['elite_user_ratio'].to_dict())
print("Calculated elite user ratios...")

# Add to restaurant dataframe
rdf['elite_user_ratio'] = rdf['business_id'].map(elite_user_ratios).fillna(0)


Retrieved elite reviews...
Calculated elite user ratios...


In [None]:
rdf.columns

In [72]:
rdf.to_csv('../yelp_dataset/yelp_restaurants_filtered_with_new_features.csv', index=False)


### Get's category counts per user from reviews and business categories

This helps create the users food type/restaurant type prefernces

In [57]:
# get user taste profile based on categories of restaurants they reviewed
# use the category mappings from earlier to get counts per category per user and select the top category for the user
# also give a ratio of that category count to total reviews

# Create business_id -> categories mapping once
business_categories = rdf.set_index('business_id')['categories'].to_dict()

# category counts per user
user_category_counts = defaultdict(Counter)

# process categories for all reviews at once
for user_id, business_id in zip(reviews['user_id'], reviews['business_id']):
    cats = business_categories.get(business_id)

    if cats:
        cats = cats.split(', ') if isinstance(cats, str) else cats

        for cat in cats:
            cat_lower = cat.strip().lower()
            # avoid the obvious/top categories

            if cat_lower not in ('food', 'restaurants'):
                user_category_counts[user_id][cat_lower] += 1

#### The output for a single user will look like:
```
{
   'mh_-eMZ6K5RLWhZyISBhwA': Counter({'american (new)': 6, 'bars': 5, 'nightlife': 5, 'pizza': 4, 'breakfast & brunch': 3, 'sandwiches': 3, 'italian': 3, 'mexican':3,
        'juice bars & smoothies': 2, 'coffee & tea': 2, 'diners': 2, 'asian fusion': 2, 'noodles': 2, 'mediterranean': 2, 'fast food': 2, 'chinese': 2, 'japanese': 2,
        'desserts': 2, 'southern': 1, 'american (traditional)': 1, 'donuts': 1, 'pan asian': 1, 'soup': 1, 'wine bars': 1, 'food delivery services': 1, 'dim sum': 1,
        'event planning & services': 1, 'caterers': 1, 'barbeque': 1, 'salad': 1, 'ice cream & frozen yogurt': 1, 'sushi bars': 1, 'latin american': 1, 'sports bars': 1,
        'greek': 1, 'beer': 1, 'wine & spirits': 1, 'indian': 1, 'breweries': 1, 'tex-mex': 1})
}

In [63]:
for u in list(user_category_counts.keys())[:5]:
    print(u, user_category_counts[u])

mh_-eMZ6K5RLWhZyISBhwA Counter({'american (new)': 6, 'bars': 5, 'nightlife': 5, 'pizza': 4, 'breakfast & brunch': 3, 'sandwiches': 3, 'italian': 3, 'mexican': 3, 'juice bars & smoothies': 2, 'coffee & tea': 2, 'diners': 2, 'asian fusion': 2, 'noodles': 2, 'mediterranean': 2, 'fast food': 2, 'chinese': 2, 'japanese': 2, 'desserts': 2, 'southern': 1, 'american (traditional)': 1, 'donuts': 1, 'pan asian': 1, 'soup': 1, 'wine bars': 1, 'food delivery services': 1, 'dim sum': 1, 'event planning & services': 1, 'caterers': 1, 'barbeque': 1, 'salad': 1, 'ice cream & frozen yogurt': 1, 'sushi bars': 1, 'latin american': 1, 'sports bars': 1, 'greek': 1, 'beer': 1, 'wine & spirits': 1, 'indian': 1, 'breweries': 1, 'tex-mex': 1})
8g_iMtfSiwikVnbP2etR0A Counter({'nightlife': 14, 'bars': 13, 'american (new)': 12, 'american (traditional)': 7, 'breakfast & brunch': 6, 'burgers': 6, 'mexican': 5, 'steakhouses': 5, 'pubs': 4, 'local flavor': 4, 'irish': 3, 'barbeque': 3, 'breweries': 3, 'seafood': 3, '

### Create ratios for each key in mappings dict to help build user taste profile


In [65]:
# use mappings from create mappings and the user counted items to create ratios for each key in the mappings dict
# calculate ratios for each mapping category
user_category_ratios = {}
category_mappings = create_common_mappings()
for user_id, category_counter in user_category_counts.items():
    if not category_counter:
        continue

    total_reviews = sum(category_counter.values())
    user_ratios = {}

    # for each mapped category group (e.g., 'has_american', 'has_asian_food')
    for mapping_key, category_list in category_mappings.items():
        # count reviews in this category group
        count = sum(category_counter[cat] for cat in category_list if cat in category_counter)
        ratio = count / total_reviews if total_reviews > 0 else 0
        user_ratios[mapping_key+"_ratio"] = ratio

    user_category_ratios[user_id] = user_ratios

### add to user category ratios to user dataframe


In [66]:
# add to user category ratios to user dataframe
for mapping_key in category_mappings.keys():
    updated_key = mapping_key + "_ratio"
    udf[updated_key] = udf['user_id'].map(lambda uid: user_category_ratios.get(uid, {}).get(updated_key, 0))
udf.head(4)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,has_american_ratio,has_asian_food_ratio,has_hispanic_food_ratio,has_european_food_ratio,has_seafood_ratio,serves_alcohol_ratio,has_vegetarian_ratio,has_entertainment_ratio,has_coffee_or_tea_ratio,serves_sweets_ratio
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,0.1875,0.0,0.03125,0.0,0.03125,0.03125,0.0,0.03125,0.0,0.0
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,0.204724,0.03937,0.0,0.023622,0.03937,0.031496,0.007874,0.055118,0.03937,0.03937
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,0.405405,0.189189,0.0,0.0,0.054054,0.0,0.0,0.027027,0.0,0.054054
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,0.355556,0.0,0.022222,0.044444,0.066667,0.0,0.0,0.0,0.044444,0.044444


#### Adds users favorite category and ratio to user dataframe

In [58]:
# get each user's top category and ratio
user_top_categories = {}

for user_id, category_counter in user_category_counts.items():
    if not category_counter:
        continue

    # get most common category
    top_category, count = category_counter.most_common(1)[0]

    # calculate ratio
    total_reviews = sum(category_counter.values())
    ratio = count / total_reviews if total_reviews > 0 else 0

    user_top_categories[user_id] = {
        'top_category': top_category,
        'count': count,
        'total_reviews': total_reviews,
        'ratio': ratio}


In [59]:
# add to user df
udf['top_category'] = udf['user_id'].map(lambda uid: user_top_categories.get(uid, {}).get('top_category'))
udf['top_category_ratio'] = udf['user_id'].map(lambda uid: user_top_categories.get(uid, {}).get('ratio', 0))

In [60]:
udf.top_category.value_counts().head(20)

top_category
nightlife                 101533
american (traditional)     94232
american (new)             85303
bars                       83856
breakfast & brunch         76929
seafood                    67697
mexican                    60825
sandwiches                 57032
pizza                      54245
italian                    47734
coffee & tea               40280
cajun/creole               33556
burgers                    32492
chinese                    27042
sushi bars                 23178
fast food                  20237
steakhouses                20139
japanese                   19711
cafes                      17546
barbeque                   16654
Name: count, dtype: int64

#### User rating variance calculation

In [69]:
# user rating variance
user_rating_stats = reviews.groupby('user_id')['stars'].agg(['std']).reset_index()
user_rating_stats.columns = ['user_id', 'rating_variance']
udf = udf.merge(user_rating_stats, on='user_id', how='left')

In [68]:
udf.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos', 'base_city', 'inferred_state', 'state_code',
       'avg_sentiment', 'top_category', 'top_category_ratio',
       'has_american_ratio', 'has_asian_food_ratio', 'has_hispanic_food_ratio',
       'has_european_food_ratio', 'has_seafood_ratio', 'serves_alcohol_ratio',
       'has_vegetarian_ratio', 'has_entertainment_ratio',
       'has_coffee_or_tea_ratio', 'serves_sweets_ratio'],
      dtype='object')

Check ratio for tourist vs local resturaunt visits

In [73]:
# calculate local vs tourist ratio for users using reviews
user_local_ratios = (
    reviews.groupby('user_id')['is_local']
    .agg(['sum', 'count'])
    .assign(local_review_ratio=lambda x: x['sum'] / x['count'])
    ['local_review_ratio']
    .to_dict())

udf['local_review_ratio'] = udf['user_id'].map(user_local_ratios).fillna(0)


#### Get user date fwatures

In [74]:
# review recency and frequency
reviews['date'] = pd.to_datetime(reviews['date'])
user_review_activity = reviews.groupby('user_id')['date'].agg([('first_review', 'min'), ('last_review', 'max')]).reset_index()

# create user activity features
user_review_activity['days_since_last_review'] = (reviews.date.max() - user_review_activity['last_review']).dt.days
user_review_activity['days_active'] = (user_review_activity['last_review'] - user_review_activity['first_review']).dt.days
user_review_activity['review_frequency'] = udf.set_index('user_id')['review_count'] / user_review_activity['days_active'].replace(0, 1)


udf = udf.merge(user_review_activity[['user_id', 'days_since_last_review', 'review_frequency']], on='user_id', how='left')

#### Get price preference from business rprice range

In [76]:
# price range preference
user_price_prefs = reviews.merge(
    rdf[['business_id', 'RestaurantsPriceRange2']],
    on='business_id',
    how='left')

# convert to numeric, coercing errors to NaN
user_price_prefs['RestaurantsPriceRange2'] = pd.to_numeric(user_price_prefs['RestaurantsPriceRange2'],errors='coerce').fillna(2)

# calculate mean, ignoring NaN values
user_avg_price = user_price_prefs.groupby('user_id')['RestaurantsPriceRange2'].mean().to_dict()
udf['avg_price_preference'] = udf['user_id'].map(user_avg_price)

In [77]:
# geographic diversity
user_city_diversity = reviews.groupby('user_id')['business_city'].nunique().to_dict()
udf['num_cities_reviewed'] = udf['user_id'].map(user_city_diversity).fillna(1)


In [79]:
rdf.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'num_tips', 'num_checkins',
       'has_coffee_or_tea', 'serves_sweets', 'RestaurantsPriceRange2',
       'serves_alcohol', 'DogsAllowed', 'has_european_food', 'has_american',
       'has_asian_food', 'has_seafood', 'has_entertainment',
       'has_hispanic_food', 'has_vegetarian', 'Open24Hours', 'avg_sentiment',
       'local_preference_ratio', 'elite_user_ratio'],
      dtype='object')

In [80]:
udf.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos', 'base_city', 'inferred_state', 'state_code',
       'avg_sentiment', 'top_category', 'top_category_ratio',
       'has_american_ratio', 'has_asian_food_ratio', 'has_hispanic_food_ratio',
       'has_european_food_ratio', 'has_seafood_ratio', 'serves_alcohol_ratio',
       'has_vegetarian_ratio', 'has_entertainment_ratio',
       'has_coffee_or_tea_ratio', 'serves_sweets_ratio', 'years_elite',
       'rating_variance', 'local_review_ratio', 'days_since_last_review',
       'review_frequency', 'avg_price_preference', 'num_cities_reviewed'],
      dtype='object')

In [78]:
# save user dataframe with new features
udf.to_csv('../yelp_dataset/yelp_users_with_taste_profile.csv', index=False)