In [19]:
import pandas as pd
import pandas as pd
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from fuzzywuzzy import process
# pip install fuzzywuzzy[speedup]
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut


In [20]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

geolocator = Nominatim(user_agent="restaurant-country-finder")

def get_country(lat, lon):
    """Reverse-geocode a coordinate pair and return its country name.

    Args:
        lat: Latitude passed to the geocoder.
        lon: Longitude passed to the geocoder.

    Returns:
        The ``country`` entry of the geocoder's address details, or ``None``
        when no location is found, the response carries no address/country,
        or the request times out.
    """
    try:
        location = geolocator.reverse((lat, lon), exactly_one=True, timeout=10)
    except GeocoderTimedOut:
        # Best effort: a timed-out lookup is reported as "unknown", not raised.
        return None

    if not location:
        return None

    # Do not index straight into the raw payload: a response without an
    # 'address' section would otherwise raise KeyError instead of returning None.
    address = (location.raw or {}).get('address') or {}
    return address.get('country')


In [2]:
# Raw string literals keep the backslashes in these Windows paths as-is.
business_path = r"C:\Users\MUIS\OneDrive - Eltronic Group A S\Desktop\ArunProject\yelp_academic_dataset_business.json"
review_path = r"C:\Users\MUIS\OneDrive - Eltronic Group A S\Desktop\ArunProject\yelp_academic_dataset_review.json"

# Step 1: parse the business file line by line (one JSON object per line) and
# keep only the records whose categories string contains "Restaurants".
print("Loading businesses...")
with open(business_path, 'r', encoding='utf-8') as business_file:
    restaurants = [
        record
        for record in map(json.loads, business_file)
        if record.get("categories") and "Restaurants" in record["categories"]
    ]
restaurants_df = pd.DataFrame(restaurants)
restaurant_ids = set(restaurants_df['business_id'])

print(f"Loaded {len(restaurants_df)} restaurant businesses.")

# Step 2: stream the review file, keeping reviews of the restaurants above and
# stopping once 100K of them have been collected (keeps the run fast).
print("Loading reviews...")
review_chunk = []
with open(review_path, 'r', encoding='utf-8') as review_file:
    for i, raw_line in enumerate(review_file):
        # Progress message every 100,000 lines read (not reviews kept).
        if i > 0 and i % 100000 == 0:
            print(f"Processed {i} reviews...")
        review = json.loads(raw_line)
        if review['business_id'] not in restaurant_ids:
            continue
        review_chunk.append(review)
        if len(review_chunk) >= 100000:
            break
reviews_df = pd.DataFrame(review_chunk)

print(f"Loaded {len(reviews_df)} reviews related to restaurants.")


Loading businesses...
Loaded 52268 restaurant businesses.
Loading reviews...
Processed 100000 reviews...
Loaded 100000 reviews related to restaurants.


In [3]:
# Preview the first rows of the restaurant table built in the loading cell.
restaurants_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."


In [4]:
# Preview the first rows of the loaded restaurant reviews.
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31


## Preprocess the Data

In [5]:
# Column dtypes and non-null counts for the restaurant table.
restaurants_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52268 entries, 0 to 52267
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   52268 non-null  object 
 1   name          52268 non-null  object 
 2   address       52268 non-null  object 
 3   city          52268 non-null  object 
 4   state         52268 non-null  object 
 5   postal_code   52268 non-null  object 
 6   latitude      52268 non-null  float64
 7   longitude     52268 non-null  float64
 8   stars         52268 non-null  float64
 9   review_count  52268 non-null  int64  
 10  is_open       52268 non-null  int64  
 11  attributes    51703 non-null  object 
 12  categories    52268 non-null  object 
 13  hours         44990 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 5.6+ MB


In [6]:
# Column dtypes and non-null counts for the review table.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    100000 non-null  object 
 1   user_id      100000 non-null  object 
 2   business_id  100000 non-null  object 
 3   stars        100000 non-null  float64
 4   useful       100000 non-null  int64  
 5   funny        100000 non-null  int64  
 6   cool         100000 non-null  int64  
 7   text         100000 non-null  object 
 8   date         100000 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 6.9+ MB


In [7]:
# Parse the review timestamps and keep only the columns used from here on.
reviews_df = reviews_df.assign(date=pd.to_datetime(reviews_df['date']))[
    ['user_id', 'business_id', 'stars', 'text', 'date']
]

# Attach restaurant details to every review; the left join keeps all review rows.
business_columns = ['business_id', 'name', 'categories', 'city', 'state', 'latitude', 'longitude']
merged_df = reviews_df.merge(restaurants_df[business_columns], how='left', on='business_id')

print(f"Merged data has shape: {merged_df.shape}")


Merged data has shape: (100000, 11)


In [8]:
# Preview the merged review + restaurant rows.
merged_df.head()

Unnamed: 0,user_id,business_id,stars,text,date,name,categories,city,state,latitude,longitude
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...",North Wales,PA,40.210196,-75.223639
1,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,Kettle Restaurant,"Restaurants, Breakfast & Brunch",Tucson,AZ,32.207233,-110.980864
2,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Zaika,"Halal, Pakistani, Restaurants, Indian",Philadelphia,PA,40.079848,-75.02508
3,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Melt,"Sandwiches, Beer, Wine & Spirits, Bars, Food, ...",New Orleans,LA,29.962102,-90.087958
4,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Dmitri's,"Mediterranean, Restaurants, Seafood, Greek",Philadelphia,PA,39.938013,-75.148131


In [9]:
# Create metadata DataFrame with exactly one row per restaurant name.
# drop_duplicates() without `subset` only removes rows that are identical in
# every selected column, so a name with several locations would remain
# duplicated in the index and a label lookup would return several rows for it.
# Keep the first location seen for each name instead.
restaurant_metadata = (
    merged_df[['name', 'city', 'state', 'latitude', 'longitude']]
    .drop_duplicates(subset='name')
    .set_index('name')
)

In [10]:
# Persist the metadata table; the relative path resolves against the current working directory.
restaurant_metadata.to_pickle("restaurant_metadata.pkl")

In [11]:
# Distinct star ratings present in the merged data.
merged_df["stars"].unique()

array([3., 5., 4., 1., 2.])

In [12]:
# Column dtypes and non-null counts after the merge.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   user_id      100000 non-null  object        
 1   business_id  100000 non-null  object        
 2   stars        100000 non-null  float64       
 3   text         100000 non-null  object        
 4   date         100000 non-null  datetime64[ns]
 5   name         100000 non-null  object        
 6   categories   100000 non-null  object        
 7   city         100000 non-null  object        
 8   state        100000 non-null  object        
 9   latitude     100000 non-null  float64       
 10  longitude    100000 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(7)
memory usage: 8.4+ MB


## Create User-Item Rating Matrix

This matrix is the foundation of collaborative filtering methods:

- User-Based: Recommend items that similar users liked.

- Item-Based: Recommend items similar to what the user liked.

- Matrix Factorization (SVD, ALS, etc.): Learn latent factors from the matrix.

In [13]:
# Step 4: Create user-item matrix
# Rows are users, columns are restaurant names, cells hold star ratings.
# Repeated (user, name) pairs are combined by pivot_table's default aggregation
# (the mean, per the pandas documentation).
user_item_matrix = merged_df.pivot_table(values='stars', index='user_id', columns='name')

# Replace "not rated" (NaN) with 0 so the matrix is fully numeric.
user_item_matrix_filled = user_item_matrix.fillna(value=0)


- index='user_id': Rows are users.

- columns='name': Columns are restaurant names, so locations that share a name (e.g. branches of a chain) are combined into a single column.

- values='stars': Values are the star ratings given by users to restaurants.

- NaN means a user has not rated a restaurant.

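- Filling with 0 gives algorithms that need a fully numeric matrix (such as the TruncatedSVD used below) something to work with, but note that the model then treats a missing rating as an actual value of 0 ("no interaction"), not as unknown.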

In [14]:
# Display the filled matrix; the rendering is truncated ("...") because the frame is large.
user_item_matrix_filled

name,'feine,1 Stop Pizza,101 Taiwanese Cuisine,10th Street Italian,1200 Chophouse,12th & Porter,16th Street Seafood,1911 Smoke House Barbeque,1925 Cocktail Lounge,2 in One Cafe,...,eegee's,fat Rooster diner,honeygrow,iCafe,iLuv Pho,iPho Vietnamese Restaurant,il Tavolo Trattoria,la Madeleine,swah-rey,sweetgreen
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4AjktZiHowEIBCMd4CZA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--E0uVPphTORm_OiZ5KCvA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--KMTwCrhKKUmr7riuS4WQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--S8M395r8NtOCvS2LRfDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzgMuJgxmToqcJ5iu1TngQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzjThJ4A1m1N78gquSR_QA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzsPf8xNW11nd0B6MZqfRw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzsqjDvanJhH9tn8NautOQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
# Sanity check on one restaurant column: the values are star ratings plus the 0 used to fill "not rated".
user_item_matrix_filled["'feine"].unique()


array([0., 3., 5., 2., 4., 1.])

## Apply Truncated SVD to Learn Latent Features

Build Item-Based Collaborative Filtering

In [114]:
# Reduce the rating matrix to 20 latent features with TruncatedSVD
# (fixed random_state so repeated runs use the same seed).
svd = TruncatedSVD(random_state=42, n_components=20)

# The matrix is transposed first so each row is a restaurant; item_matrix
# therefore holds one 20-component latent vector per restaurant.
item_matrix = svd.fit_transform(user_item_matrix_filled.transpose())

## Calculate Restaurant Similarity

- Compute cosine similarity between every pair of restaurants based on their SVD embeddings.
- Create a DataFrame from the similarity matrix with restaurant names as row/column labels for easy lookup.


In [115]:
# Pairwise cosine similarity between the restaurants' latent vectors.
similarity_matrix = cosine_similarity(item_matrix)

# Label both axes with the restaurant names so scores can be looked up by name.
restaurant_names = user_item_matrix_filled.columns
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=restaurant_names,
    columns=restaurant_names,
)

print("TruncatedSVD model fitted and similarity matrix created.")


TruncatedSVD model fitted and similarity matrix created.


In [116]:
# Preview the name-by-name similarity table (the diagonal is each restaurant compared with itself).
similarity_df.head()

name,'feine,1 Stop Pizza,101 Taiwanese Cuisine,10th Street Italian,1200 Chophouse,12th & Porter,16th Street Seafood,1911 Smoke House Barbeque,1925 Cocktail Lounge,2 in One Cafe,...,eegee's,fat Rooster diner,honeygrow,iCafe,iLuv Pho,iPho Vietnamese Restaurant,il Tavolo Trattoria,la Madeleine,swah-rey,sweetgreen
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'feine,1.0,-0.105899,-0.102117,0.314479,0.029942,0.070527,0.70312,0.023892,0.282228,-0.493109,...,-0.066685,0.177817,0.609872,-0.138705,-0.049444,0.332539,0.181136,0.067214,-0.082262,0.82998
1 Stop Pizza,-0.105899,1.0,0.121976,0.171001,0.174774,0.042794,0.279271,0.440858,-0.067523,0.296214,...,0.07862,-0.088877,-0.012803,0.269335,0.13452,-0.06763,-0.558961,-0.178176,0.114041,0.056498
101 Taiwanese Cuisine,-0.102117,0.121976,1.0,-0.108208,0.048057,0.001426,0.256666,0.1348,0.754023,0.509246,...,0.282207,0.780549,0.373027,0.029414,0.178012,0.671654,0.080501,-0.135656,0.089244,-0.017891
10th Street Italian,0.314479,0.171001,-0.108208,1.0,0.144145,0.002137,0.439883,-0.007128,0.000721,0.048755,...,0.10636,-0.059514,0.053122,0.180788,0.077383,-0.031836,0.133137,-0.042852,-0.008505,0.1974
1200 Chophouse,0.029942,0.174774,0.048057,0.144145,1.0,-0.021517,0.070275,0.253191,0.000823,-0.005873,...,0.027193,-0.001286,-0.028203,-0.094919,0.050945,0.020877,-0.057879,0.009389,0.981152,0.020916


In [117]:
# Persist the similarity table; the relative path resolves against the current working directory.
similarity_df.to_pickle("similarity_matrix.pkl")

## Define Recommendation Function

In [118]:
def get_best_match(query, choices, threshold=70):
    """Return the choice that best fuzzy-matches ``query``, or ``None``.

    Args:
        query: Text to look up.
        choices: Candidate strings to match against.
        threshold: Minimum match score required to accept the best candidate.

    Returns:
        The best-matching choice when its score is at least ``threshold``;
        otherwise ``None`` (also when there is nothing to match against).
    """
    result = process.extractOne(query, choices)
    # extractOne returns None when it has no candidate to offer (e.g. empty
    # choices); unpacking that directly would raise TypeError.
    if result is None:
        return None
    # Index instead of unpacking: the result tuple may carry a third element
    # (the key) when choices is dict-like.
    match, score = result[0], result[1]
    return match if score >= threshold else None

In [119]:
def recommend_similar_restaurants_svd(target_restaurant, similarity_df, metadata_df, n=5, save_csv=False):
    """Return the restaurants most similar to a (fuzzily matched) restaurant name.

    Args:
        target_restaurant: Restaurant name to look up; it is resolved against
            ``similarity_df.index`` with ``get_best_match``.
        similarity_df: Square DataFrame of similarity scores whose index and
            columns are restaurant names.
        metadata_df: DataFrame indexed by restaurant name with ``city`` and
            ``state`` columns.
        n: Maximum number of recommendations to return.
        save_csv: When True, also write the result to
            ``top_{n}_similar_to_{name}.csv`` (relative path).

    Returns:
        DataFrame with the columns "Similar Restaurant", "City", "State" and
        "Similarity Score (%)", ordered from most to least similar. An empty
        DataFrame is returned when no restaurant name matches.
    """
    # Fuzzy match
    best_match = get_best_match(target_restaurant, similarity_df.index)
    if best_match is None:
        print(f"Restaurant '{target_restaurant}' not found (even after fuzzy match).")
        return pd.DataFrame()

    # Get similarity scores and top N (the restaurant itself is excluded)
    similar_scores = similarity_df[best_match].drop(best_match)
    top_similar = similar_scores.sort_values(ascending=False).head(n)
    similarity_percent = (top_similar.values * 100).round(1)

    # Get city/state info, one row per recommended name and in the same order.
    # A plain .loc lookup returns several rows for a name that occurs more than
    # once in the metadata index (making the columns below differ in length)
    # and raises KeyError for a name that is absent. Keep the first metadata
    # row per name and reindex so absent names yield NaN instead.
    unique_meta = metadata_df[~metadata_df.index.duplicated(keep='first')]
    meta_info = unique_meta.reindex(top_similar.index)

    result_df = pd.DataFrame({
        "Similar Restaurant": top_similar.index,
        "City": meta_info['city'].values,
        "State": meta_info['state'].values,
        "Similarity Score (%)": similarity_percent
    })

    print(f"\nTop {n} restaurants similar to '{best_match}':")

    if save_csv:
        # Restaurant names can contain characters that are not allowed in file
        # names (e.g. / : ? on Windows); replace them so the write cannot fail
        # or land in an unintended directory.
        unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
        csv_path = f"top_{n}_similar_to_{str(best_match).translate(unsafe_chars)}.csv"
        result_df.to_csv(csv_path, index=False)
        print(f"Saved recommendations to '{csv_path}'")

    return result_df


## Recommend Similar Restaurants

In [120]:
# "1 Stop Piza" is not an exact restaurant name; fuzzy matching resolves it to '1 Stop Pizza' (see the output below).
# save_csv=True additionally writes the returned table to a CSV file.
recommend_similar_restaurants_svd("1 Stop Piza", similarity_df, restaurant_metadata, n=5, save_csv=True)


Top 5 restaurants similar to '1 Stop Pizza':
Saved recommendations to 'top_5_similar_to_1 Stop Pizza.csv'


Unnamed: 0,Similar Restaurant,City,State,Similarity Score (%)
0,Meson 923,New Orleans,LA,92.0
1,Twisted Roots,St. Louis,MO,88.8
2,Brewsters Brewing Company & Restaurant,Edmonton,AB,88.6
3,Taqueria El Sitio,Hermitage,TN,85.4
4,AeroCaffe´,Boise,ID,84.5
