In [1]:
import os
from dotenv import load_dotenv
import requests
import json
import pandas as pd


In [2]:
load_dotenv()
YELP_API_KEY = os.getenv('YELP_API_KEY')


In [3]:
if not YELP_API_KEY:
    print("Error: YELP_API_KEY not found in the .env file. Please ensure it's configured correctly.")
    exit()
else:
    print("Yelp API Key loaded successfully.")


Yelp API Key loaded successfully.


In [4]:
YELP_API_BASE_URL = "https://api.yelp.com/v3"

In [5]:
headers = {
    "Authorization": f"Bearer {YELP_API_KEY}"
}

In [26]:

latitude = 33.9789
longitude =  -117.32813
radius = 16093

In [22]:
search_params = {
    "latitude": latitude,
    "longitude": longitude,
    "radius": 16093,  # Search radius in meters (approximately 10 miles)
    "categories": "restaurants",
}


In [28]:

def search_all_restaurants(lat, lon, radius, categories, limit=50):
    """
    Searches for all restaurants within the given parameters, handling pagination.
    """
    businesses = []
    offset = 0
    total_fetched = 0
    max_possible = 1000  # Yelp API limit for total results in a search

    print("\nFetching all possible restaurants...")

    while total_fetched < max_possible:
        search_params = {
            "latitude": lat,
            "longitude": lon,
            "radius": radius,
            "categories": categories,
            "limit": limit,
            "offset": offset
        }
        businesses_endpoint = f"{YELP_API_BASE_URL}/businesses/search"

        try:
            response = requests.get(businesses_endpoint, headers=headers, params=search_params)
            response.raise_for_status()
            search_results = response.json()

            if search_results and 'businesses' in search_results:
                current_businesses = search_results['businesses']
                if not current_businesses:
                    print("No more restaurants found.")
                    break
                businesses.extend(current_businesses)
                total_fetched += len(current_businesses)
                print(f"Fetched {total_fetched} restaurants so far.")
                offset += limit
                if 'total' in search_results and total_fetched >= search_results['total']:
                    print("Reached total number of restaurants reported by Yelp.")
                    break
            else:
                print("Error or no businesses found in this batch.")
                break

            # Be mindful of rate limits - you might need to add a small delay here
            # time.sleep(0.1)

        except requests.exceptions.RequestException as e:
            print(f"Error during Yelp Businesses Search API request (offset {offset}): {e}")
            break

        if total_fetched >= 1000: # Hard stop at Yelp's reported max
            print("Reached Yelp's maximum of 1000 results per search.")
            break

    return businesses

In [29]:
all_restaurants_data = search_all_restaurants(latitude, longitude, radius, "restaurants")


Fetching all possible restaurants...
Fetched 50 restaurants so far.
Fetched 100 restaurants so far.
Fetched 150 restaurants so far.
Fetched 200 restaurants so far.
Error during Yelp Businesses Search API request (offset 200): 400 Client Error: Bad Request for url: https://api.yelp.com/v3/businesses/search?latitude=33.9789&longitude=-117.32813&radius=16093&categories=restaurants&limit=50&offset=200


In [31]:
len(all_restaurants_data)

200

In [34]:
if all_restaurants_data:
    restaurants_df = pd.DataFrame(all_restaurants_data)
    print("\nFirst 5 rows of the initial restaurants DataFrame:")
    print(restaurants_df.head()) 
    
else:
    restaurants_df = pd.DataFrame()


First 5 rows of the initial restaurants DataFrame:
                       id                                      alias  \
0  7XJsjflQAnQTzvQYzAwydA   smoke-and-fire-social-eatery-riverside-2   
1  CG0qu39ItRcAd8J5liITYQ                       elias-pita-riverside   
2  ElH8ZbPSG1rIFA3wXyFn5Q                   the-salted-pig-riverside   
3  KIVSXV1MrwIUO8E02M0H6Q                craving-crab-haus-riverside   
4  wsgdv_edV5bvssYk9pyOIg  georgie-s-mediterranean-cuisine-riverside   

                              name  \
0       Smoke & Fire Social Eatery   
1                       Elias Pita   
2                   The Salted Pig   
3                Craving Crab Haus   
4  Georgie’s Mediterranean Cuisine   

                                           image_url  is_closed  \
0  https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...      False   
1  https://s3-media2.fl.yelpcdn.com/bphoto/_n6TU4...      False   
2  https://s3-media3.fl.yelpcdn.com/bphoto/G65RHz...      False   
3  https://s3-medi

In [36]:
len(restaurants_df)

200

In [41]:
def get_all_restaurant_reviews(business_id, max_reviews = 35):
    """
    Fetches all reviews for a given Yelp business ID, handling pagination.
    """
    reviews = []
    offset = 0
    total_fetched = 0
    limit_per_call = 7
    print(f"Fetching up to {max_reviews} reviews for business ID: {business_id}")
    while total_fetched < max_reviews:
        reviews_endpoint = f"{YELP_API_BASE_URL}/businesses/{business_id}/reviews"
        params = {"offset": offset}
        try:
            response = requests.get(reviews_endpoint, headers=headers, params=params)
            response.raise_for_status()
            reviews_data = response.json()
            if reviews_data and 'reviews' in reviews_data:
                current_reviews = reviews_data['reviews']
                if not current_reviews:
                    print("No more reviews found for this restaurant.")
                    break
                reviews.extend(current_reviews)
                total_fetched += len(current_reviews)
                offset += limit_per_call
                print(f"Fetched {len(reviews)} reviews so far for this restaurant.")
                if len(current_reviews) < limit_per_call:
                    print("Reached end of reviews for this restaurant.")
                    break

            else:
                print("Error or no reviews found in this batch for this restaurant.")
                break
            # Be mindful of rate limits - you might need a small delay here
            # time.sleep(0.1)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching reviews for business ID {business_id} (offset {offset}): {e}")
            break
    return reviews

In [42]:
MAX_REVIEWS_PER_RESTAURANT = 35

In [43]:
all_restaurant_reviews = []
api_calls_used = 0

if not restaurants_df.empty:
    print(f"\nFetching up to {MAX_REVIEWS_PER_RESTAURANT} reviews for each restaurant (up to API call limit)...")
    for index, row in restaurants_df.iterrows():
        if api_calls_used >= 3900:  # Leave some buffer
            print("Approaching API call limit. Stopping review fetching.")
            break

        business_id = row['id']
        reviews = get_all_restaurant_reviews(business_id, max_reviews=MAX_REVIEWS_PER_RESTAURANT)
        api_calls_used += (len(reviews) // 3) + (1 if len(reviews) % 3 > 0 else 0) # Estimate API calls
        # A more precise way would be to increment the counter inside get_all_restaurant_reviews

        for review in reviews:
            review['business_id'] = business_id
            all_restaurant_reviews.append(review)

        print(f"Finished fetching reviews for restaurant ID: {business_id}. Estimated total API calls used: {api_calls_used}")

    if all_restaurant_reviews:
        reviews_df = pd.DataFrame(all_restaurant_reviews)
        print(f"\nSuccessfully fetched a total of {len(reviews_df)} reviews.")
        print("\nFirst 5 rows of the reviews DataFrame:")
        print(reviews_df.head())
    else:
        reviews_df = pd.DataFrame()
        print("No reviews fetched for the found restaurants.")
else:
    reviews_df = pd.DataFrame()
    print("No restaurants available to fetch reviews for.")


Fetching up to 35 reviews for each restaurant (up to API call limit)...
Fetching up to 35 reviews for business ID: 7XJsjflQAnQTzvQYzAwydA
Fetched 3 reviews so far for this restaurant.
Reached end of reviews for this restaurant.
Finished fetching reviews for restaurant ID: 7XJsjflQAnQTzvQYzAwydA. Estimated total API calls used: 1
Fetching up to 35 reviews for business ID: CG0qu39ItRcAd8J5liITYQ
Fetched 3 reviews so far for this restaurant.
Reached end of reviews for this restaurant.
Finished fetching reviews for restaurant ID: CG0qu39ItRcAd8J5liITYQ. Estimated total API calls used: 2
Fetching up to 35 reviews for business ID: ElH8ZbPSG1rIFA3wXyFn5Q
Fetched 3 reviews so far for this restaurant.
Reached end of reviews for this restaurant.
Finished fetching reviews for restaurant ID: ElH8ZbPSG1rIFA3wXyFn5Q. Estimated total API calls used: 3
Fetching up to 35 reviews for business ID: KIVSXV1MrwIUO8E02M0H6Q
Fetched 3 reviews so far for this restaurant.
Reached end of reviews for this restau

In [48]:
print("Columns in restaurants_df:")
print(restaurants_df.columns)
print("\nColumns in restaurant_reviews_df:")
print(restaurant_reviews_df.columns)


Columns in restaurants_df:
Index(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count',
       'categories', 'rating', 'coordinates', 'transactions', 'price',
       'location', 'phone', 'display_phone', 'distance', 'business_hours',
       'attributes'],
      dtype='object')

Columns in restaurant_reviews_df:
Index(['id_x', 'alias', 'name', 'image_url', 'is_closed', 'url_x',
       'review_count', 'categories', 'rating_x', 'coordinates', 'transactions',
       'price', 'location', 'phone', 'display_phone', 'distance',
       'business_hours', 'attributes', 'id_y', 'url_y', 'text', 'rating_y',
       'time_created', 'user', 'business_id'],
      dtype='object')


In [45]:
reviews_df.head()

Unnamed: 0,id,url,text,rating,time_created,user,business_id
0,e84DJb5bqU63D-7cObU4pQ,https://www.yelp.com/biz/smoke-and-fire-social...,"Food was good, service was great, Alexa was ve...",4,2025-04-05 12:27:18,"{'id': 'KcxN-hBjh7OsdQJh_0xvWA', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
1,rWQ-KpfjLDTBes6LarDuhg,https://www.yelp.com/biz/smoke-and-fire-social...,Absolutely love the atmosphere and the food is...,5,2025-03-23 21:16:35,"{'id': 'DJDC7kwj4qItmsi5BXWC8A', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
2,O3Rnl2Ew8uulDA_TBPTDtA,https://www.yelp.com/biz/smoke-and-fire-social...,They ran out of fries and coleslaw and still c...,3,2025-03-03 03:39:00,"{'id': 'S3gYxVAwJcGy76e3gdJZBg', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
3,UV3u1IbW-q38XWgTZgRdnQ,https://www.yelp.com/biz/elias-pita-riverside?...,"Excellent + food was delicious, great portion,...",5,2025-04-06 11:01:58,"{'id': 'kQKosEedp_QFSZS7wyB3Og', 'profile_url'...",CG0qu39ItRcAd8J5liITYQ
4,43QScRvU-r0NLjt-dzG5WQ,https://www.yelp.com/biz/elias-pita-riverside?...,The food was amazing! We had the gyro's and hu...,5,2024-11-08 15:17:32,"{'id': 'ORK17DiBQHAmLYdEjzsnJQ', 'profile_url'...",CG0qu39ItRcAd8J5liITYQ


In [46]:
if not restaurants_df.empty and not reviews_df.empty:
    restaurant_reviews_df = pd.merge(restaurants_df, reviews_df, left_on='id', right_on='business_id', how='inner')
    print(f"\nMerged DataFrame shape: {restaurant_reviews_df.shape}")
    print("\nFirst 5 rows of the merged DataFrame:")
    print(restaurant_reviews_df.head())
else:
    restaurant_reviews_df = pd.DataFrame()
    print("\nCould not merge restaurant and review data as one or both DataFrames are empty.")


Merged DataFrame shape: (600, 25)

First 5 rows of the merged DataFrame:
                     id_x                                     alias  \
0  7XJsjflQAnQTzvQYzAwydA  smoke-and-fire-social-eatery-riverside-2   
1  7XJsjflQAnQTzvQYzAwydA  smoke-and-fire-social-eatery-riverside-2   
2  7XJsjflQAnQTzvQYzAwydA  smoke-and-fire-social-eatery-riverside-2   
3  CG0qu39ItRcAd8J5liITYQ                      elias-pita-riverside   
4  CG0qu39ItRcAd8J5liITYQ                      elias-pita-riverside   

                         name  \
0  Smoke & Fire Social Eatery   
1  Smoke & Fire Social Eatery   
2  Smoke & Fire Social Eatery   
3                  Elias Pita   
4                  Elias Pita   

                                           image_url  is_closed  \
0  https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...      False   
1  https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...      False   
2  https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...      False   
3  https://s3-media2.fl.yelpcdn.

In [49]:
if not restaurant_reviews_df.empty:
    restaurant_reviews_final_df = restaurant_reviews_df[[
        'id_x',  # Business ID
        'name',
        'categories',  # Restaurant Categories
        'location',
        'coordinates',
        'rating_x',  # Average Rating
        'id_y',  # Review ID
        'user',
        'rating_y',  # Review Rating
        'text',  # Review Text
        'time_created'
    ]].copy()
    restaurant_reviews_final_df.rename(columns={
        'id_x': 'business_id',
        'categories': 'categories',
        'rating_x': 'avg_rating',
        'rating_y': 'review_rating',
        'text': 'review_text',
        'time_created': 'review_date',
        'id_y': 'review_id'
    }, inplace=True)
    print("\nFirst 5 rows of the final combined DataFrame:")
    print(restaurant_reviews_final_df.head())
else:
    print("\nFinal combined DataFrame is empty.")




First 5 rows of the final combined DataFrame:
              business_id                        name  \
0  7XJsjflQAnQTzvQYzAwydA  Smoke & Fire Social Eatery   
1  7XJsjflQAnQTzvQYzAwydA  Smoke & Fire Social Eatery   
2  7XJsjflQAnQTzvQYzAwydA  Smoke & Fire Social Eatery   
3  CG0qu39ItRcAd8J5liITYQ                  Elias Pita   
4  CG0qu39ItRcAd8J5liITYQ                  Elias Pita   

                                          categories  \
0  [{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...   
1  [{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...   
2  [{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...   
3  [{'alias': 'mediterranean', 'title': 'Mediterr...   
4  [{'alias': 'mediterranean', 'title': 'Mediterr...   

                                            location  \
0  {'address1': '5225 Canyon Crest Dr', 'address2...   
1  {'address1': '5225 Canyon Crest Dr', 'address2...   
2  {'address1': '5225 Canyon Crest Dr', 'address2...   
3  {'address1': '1490 University Ave', 'address2'

In [50]:
restaurant_reviews_final_df.head()

Unnamed: 0,business_id,name,categories,location,coordinates,avg_rating,review_id,user,review_rating,review_text,review_date
0,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...","{'address1': '5225 Canyon Crest Dr', 'address2...","{'latitude': 33.955971, 'longitude': -117.330408}",4.5,e84DJb5bqU63D-7cObU4pQ,"{'id': 'KcxN-hBjh7OsdQJh_0xvWA', 'profile_url'...",4,"Food was good, service was great, Alexa was ve...",2025-04-05 12:27:18
1,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...","{'address1': '5225 Canyon Crest Dr', 'address2...","{'latitude': 33.955971, 'longitude': -117.330408}",4.5,rWQ-KpfjLDTBes6LarDuhg,"{'id': 'DJDC7kwj4qItmsi5BXWC8A', 'profile_url'...",5,Absolutely love the atmosphere and the food is...,2025-03-23 21:16:35
2,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...","{'address1': '5225 Canyon Crest Dr', 'address2...","{'latitude': 33.955971, 'longitude': -117.330408}",4.5,O3Rnl2Ew8uulDA_TBPTDtA,"{'id': 'S3gYxVAwJcGy76e3gdJZBg', 'profile_url'...",3,They ran out of fries and coleslaw and still c...,2025-03-03 03:39:00
3,CG0qu39ItRcAd8J5liITYQ,Elias Pita,"[{'alias': 'mediterranean', 'title': 'Mediterr...","{'address1': '1490 University Ave', 'address2'...","{'latitude': 33.97533, 'longitude': -117.34385}",4.3,UV3u1IbW-q38XWgTZgRdnQ,"{'id': 'kQKosEedp_QFSZS7wyB3Og', 'profile_url'...",5,"Excellent + food was delicious, great portion,...",2025-04-06 11:01:58
4,CG0qu39ItRcAd8J5liITYQ,Elias Pita,"[{'alias': 'mediterranean', 'title': 'Mediterr...","{'address1': '1490 University Ave', 'address2'...","{'latitude': 33.97533, 'longitude': -117.34385}",4.3,43QScRvU-r0NLjt-dzG5WQ,"{'id': 'ORK17DiBQHAmLYdEjzsnJQ', 'profile_url'...",5,The food was amazing! We had the gyro's and hu...,2024-11-08 15:17:32


In [56]:
#all_restaurants_data = search_all_restaurants(latitude, longitude, radius, "restaurants")

if all_restaurants_data:
    restaurants_df = pd.DataFrame(all_restaurants_data)
    print(f"\nSuccessfully fetched a total of {len(restaurants_df)} restaurants.")
    print("\nFirst 5 rows of the restaurants DataFrame (initial):")
    print(restaurants_df.head())

    # Add a column to store the description
    restaurants_df['description'] = None

    print("\nFetching business details (including description) for each restaurant...")
    for index, row in restaurants_df.iterrows():
        business_id = row['id']
        business_details_endpoint = f"{YELP_API_BASE_URL}/businesses/{business_id}"
        try:
            response = requests.get(business_details_endpoint, headers=headers)
            response.raise_for_status()
            business_details = response.json()
            print(f"\nFull Business Details Response for {business_id}:")
            print(json.dumps(business_details, indent=4)) # Print the full JSON with indentation

            if 'snippet' in business_details:
                restaurants_df.loc[index, 'description'] = business_details['snippet']
            elif 'name' in business_details: # Fallback to name if no snippet
                restaurants_df.loc[index, 'description'] = f"Description not available. Showing name: {business_details['name']}"
            else:
                restaurants_df.loc[index, 'description'] = "Description not available."
            print(f"Description: {restaurants_df.loc[index, 'description']}")

            # Be mindful of API call limits

        except requests.exceptions.RequestException as e:
            print(f"Error fetching details for business ID {business_id}: {e}")
            restaurants_df.loc[index, 'description'] = "Error fetching description."

    print("\nFirst 5 rows of the restaurants DataFrame (with description):")
    print(restaurants_df[['id', 'name', 'description']].head())

else:
    restaurants_df = pd.DataFrame()
    print("\nNo restaurants fetched.")



Successfully fetched a total of 200 restaurants.

First 5 rows of the restaurants DataFrame (initial):
                       id                                      alias  \
0  7XJsjflQAnQTzvQYzAwydA   smoke-and-fire-social-eatery-riverside-2   
1  CG0qu39ItRcAd8J5liITYQ                       elias-pita-riverside   
2  ElH8ZbPSG1rIFA3wXyFn5Q                   the-salted-pig-riverside   
3  KIVSXV1MrwIUO8E02M0H6Q                craving-crab-haus-riverside   
4  wsgdv_edV5bvssYk9pyOIg  georgie-s-mediterranean-cuisine-riverside   

                              name  \
0       Smoke & Fire Social Eatery   
1                       Elias Pita   
2                   The Salted Pig   
3                Craving Crab Haus   
4  Georgie’s Mediterranean Cuisine   

                                           image_url  is_closed  \
0  https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...      False   
1  https://s3-media2.fl.yelpcdn.com/bphoto/_n6TU4...      False   
2  https://s3-media3.fl.yelpcdn.c

In [55]:
restaurants_df.head(20)

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance,business_hours,attributes,description
0,7XJsjflQAnQTzvQYzAwydA,smoke-and-fire-social-eatery-riverside-2,Smoke & Fire Social Eatery,https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...,False,https://www.yelp.com/biz/smoke-and-fire-social...,2672,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",4.5,"{'latitude': 33.955971, 'longitude': -117.330408}",[],$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19517427309,(951) 742-7309,2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",Description not available. Showing name: Smoke...
1,CG0qu39ItRcAd8J5liITYQ,elias-pita-riverside,Elias Pita,https://s3-media2.fl.yelpcdn.com/bphoto/_n6TU4...,False,https://www.yelp.com/biz/elias-pita-riverside?...,1160,"[{'alias': 'mediterranean', 'title': 'Mediterr...",4.3,"{'latitude': 33.97533, 'longitude': -117.34385}","[delivery, pickup]",$$,"{'address1': '1490 University Ave', 'address2'...",19516866800,(951) 686-6800,1509.822879,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",
2,ElH8ZbPSG1rIFA3wXyFn5Q,the-salted-pig-riverside,The Salted Pig,https://s3-media3.fl.yelpcdn.com/bphoto/G65RHz...,False,https://www.yelp.com/biz/the-salted-pig-rivers...,3349,"[{'alias': 'gastropubs', 'title': 'Gastropubs'...",3.8,"{'latitude': 33.98226, 'longitude': -117.37386}",[delivery],$$,"{'address1': '3750 Main St', 'address2': 'Ste ...",19517425664,(951) 742-5664,4243.221458,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",
3,KIVSXV1MrwIUO8E02M0H6Q,craving-crab-haus-riverside,Craving Crab Haus,https://s3-media2.fl.yelpcdn.com/bphoto/blk-Vh...,False,https://www.yelp.com/biz/craving-crab-haus-riv...,427,"[{'alias': 'cajun', 'title': 'Cajun/Creole'}, ...",4.5,"{'latitude': 33.97674, 'longitude': -117.33752}",[restaurant_reservation],$$,"{'address1': '1201 University Ave', 'address2'...",19517425859,(951) 742-5859,904.674919,"[{'open': [{'is_overnight': False, 'start': '1...","{'has_pool_table': False, 'business_url': 'htt...",
4,wsgdv_edV5bvssYk9pyOIg,georgie-s-mediterranean-cuisine-riverside,Georgie’s Mediterranean Cuisine,https://s3-media3.fl.yelpcdn.com/bphoto/3WhUY5...,False,https://www.yelp.com/biz/georgie-s-mediterrane...,426,"[{'alias': 'mediterranean', 'title': 'Mediterr...",4.5,"{'latitude': 33.95623720813322, 'longitude': -...","[delivery, pickup]",$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19518230440,(951) 823-0440,2547.020308,"[{'open': [{'is_overnight': False, 'start': '1...",{'business_url': 'https://georgiesmediterranea...,
5,sS_qTKCEmjpUnMiToaE5_A,habanero-mexican-grill-riverside,Habanero Mexican Grill,https://s3-media4.fl.yelpcdn.com/bphoto/a3PMiy...,False,https://www.yelp.com/biz/habanero-mexican-gril...,1075,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.6,"{'latitude': 33.9755435848719, 'longitude': -1...",[delivery],$$,"{'address1': '2472 University Ave', 'address2'...",19512249145,(951) 224-9145,3088.029625,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",
6,XdEtxpf1k7KsqMf8FaJmXA,kimchichanga-riverside-3,Kimchichanga,https://s3-media2.fl.yelpcdn.com/bphoto/JqQyC3...,False,https://www.yelp.com/biz/kimchichanga-riversid...,1141,"[{'alias': 'korean', 'title': 'Korean'}, {'ali...",4.1,"{'latitude': 33.9760835, 'longitude': -117.338...","[delivery, pickup]",$$,"{'address1': '1223 University Ave', 'address2'...",19516849800,(951) 684-9800,983.066517,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",
7,6gIcfFdEAz--Ugc2ctTK-w,big-sky-bistro-riverside-3,Big Sky Bistro,https://s3-media2.fl.yelpcdn.com/bphoto/sw88RJ...,False,https://www.yelp.com/biz/big-sky-bistro-rivers...,621,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.1,"{'latitude': 33.9759522815083, 'longitude': -1...",[pickup],$$,"{'address1': '1575 University Ave', 'address2'...",19513281688,(951) 328-1688,1673.458963,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_apple_pay': True, 'business...",
8,apibcypiWTD4G8ZooNA8Cg,palenque-riverside,Palenque,https://s3-media2.fl.yelpcdn.com/bphoto/0QIupR...,False,https://www.yelp.com/biz/palenque-riverside?ad...,1019,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.4,"{'latitude': 33.982619, 'longitude': -117.37445}",[],$$,"{'address1': '3737 Main St', 'address2': None,...",19518882240,(951) 888-2240,4289.09225,"[{'open': [{'is_overnight': False, 'start': '1...",{'business_url': 'http://www.palenquekitchen.c...,
9,CfKC29D6iTs1mYMk0f_KkQ,pepitos-mexican-restaurant-riverside-3,Pepitos Mexican restaurant,https://s3-media3.fl.yelpcdn.com/bphoto/C3m3tA...,False,https://www.yelp.com/biz/pepitos-mexican-resta...,813,"[{'alias': 'mexican', 'title': 'Mexican'}]",3.9,"{'latitude': 33.9564945825428, 'longitude': -1...",[delivery],$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19517839444,(951) 783-9444,2516.558119,"[{'open': [{'is_overnight': False, 'start': '1...",{'business_url': 'https://www.pepitosfood.com'...,


In [57]:
restaurant_reviews_final_df.head()

Unnamed: 0,business_id,name,categories,location,coordinates,avg_rating,review_id,user,review_rating,review_text,review_date
0,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...","{'address1': '5225 Canyon Crest Dr', 'address2...","{'latitude': 33.955971, 'longitude': -117.330408}",4.5,e84DJb5bqU63D-7cObU4pQ,"{'id': 'KcxN-hBjh7OsdQJh_0xvWA', 'profile_url'...",4,"Food was good, service was great, Alexa was ve...",2025-04-05 12:27:18
1,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...","{'address1': '5225 Canyon Crest Dr', 'address2...","{'latitude': 33.955971, 'longitude': -117.330408}",4.5,rWQ-KpfjLDTBes6LarDuhg,"{'id': 'DJDC7kwj4qItmsi5BXWC8A', 'profile_url'...",5,Absolutely love the atmosphere and the food is...,2025-03-23 21:16:35
2,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...","{'address1': '5225 Canyon Crest Dr', 'address2...","{'latitude': 33.955971, 'longitude': -117.330408}",4.5,O3Rnl2Ew8uulDA_TBPTDtA,"{'id': 'S3gYxVAwJcGy76e3gdJZBg', 'profile_url'...",3,They ran out of fries and coleslaw and still c...,2025-03-03 03:39:00
3,CG0qu39ItRcAd8J5liITYQ,Elias Pita,"[{'alias': 'mediterranean', 'title': 'Mediterr...","{'address1': '1490 University Ave', 'address2'...","{'latitude': 33.97533, 'longitude': -117.34385}",4.3,UV3u1IbW-q38XWgTZgRdnQ,"{'id': 'kQKosEedp_QFSZS7wyB3Og', 'profile_url'...",5,"Excellent + food was delicious, great portion,...",2025-04-06 11:01:58
4,CG0qu39ItRcAd8J5liITYQ,Elias Pita,"[{'alias': 'mediterranean', 'title': 'Mediterr...","{'address1': '1490 University Ave', 'address2'...","{'latitude': 33.97533, 'longitude': -117.34385}",4.3,43QScRvU-r0NLjt-dzG5WQ,"{'id': 'ORK17DiBQHAmLYdEjzsnJQ', 'profile_url'...",5,The food was amazing! We had the gyro's and hu...,2024-11-08 15:17:32


In [58]:
if not restaurant_reviews_df.empty:
    restaurant_reviews_final_df = pd.merge(restaurants_df[['id', 'description', 'attributes', 'transactions', 'distance', 'business_hours']], restaurant_reviews_df, left_on='id', right_on='business_id', how='inner')
    restaurant_reviews_final_df = restaurant_reviews_final_df[[
        'id_x',
        'name',
        'categories_x',
        'location',
        'coordinates',
        'rating_x',
        'review_id',
        'user',
        'rating_y',
        'text',
        'time_created',
        'description',
        'attributes',  # Include extracted attributes
        'transactions',          # Include transactions
        'distance',              # Include distance
        'business_hours'         # Include business_hours
    ]].copy()
    restaurant_reviews_final_df.rename(columns={
        'id_x': 'business_id',
        'categories_x': 'categories',
        'rating_x': 'avg_rating',
        'rating_y': 'review_rating',
        'text': 'review_text',
        'time_created': 'review_date',
        'description': 'description',
        'attributes': 'restaurant_details',
        'transactions': 'transactions',
        'distance': 'distance_from_riverside',  # Added a more descriptive name
        'business_hours': 'business_hours'
    }, inplace=True)
    print("\nFirst 5 rows of the final combined DataFrame (with restaurant details, transactions, distance, and business hours):")
    print(restaurant_reviews_final_df.head())
else:
    print("\nFinal combined DataFrame is empty.")


KeyError: "['categories_x', 'review_id', 'attributes', 'transactions', 'distance', 'business_hours'] not in index"

In [59]:
restaurant_reviews_df = pd.merge(restaurants_df[['id', 'name', 'attributes', 'transactions', 'distance', 'business_hours']], reviews_df, left_on='id', right_on='business_id', how='inner')

In [61]:
restaurant_reviews_df

Unnamed: 0,id_x,description,attributes,transactions,distance,business_hours,id_y,url,text,rating,time_created,user,business_id
0,7XJsjflQAnQTzvQYzAwydA,Description not available. Showing name: Smoke...,"{'business_accepts_android_pay': True, 'busine...",[],2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...",e84DJb5bqU63D-7cObU4pQ,https://www.yelp.com/biz/smoke-and-fire-social...,"Food was good, service was great, Alexa was ve...",4,2025-04-05 12:27:18,"{'id': 'KcxN-hBjh7OsdQJh_0xvWA', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
1,7XJsjflQAnQTzvQYzAwydA,Description not available. Showing name: Smoke...,"{'business_accepts_android_pay': True, 'busine...",[],2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...",rWQ-KpfjLDTBes6LarDuhg,https://www.yelp.com/biz/smoke-and-fire-social...,Absolutely love the atmosphere and the food is...,5,2025-03-23 21:16:35,"{'id': 'DJDC7kwj4qItmsi5BXWC8A', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
2,7XJsjflQAnQTzvQYzAwydA,Description not available. Showing name: Smoke...,"{'business_accepts_android_pay': True, 'busine...",[],2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...",O3Rnl2Ew8uulDA_TBPTDtA,https://www.yelp.com/biz/smoke-and-fire-social...,They ran out of fries and coleslaw and still c...,3,2025-03-03 03:39:00,"{'id': 'S3gYxVAwJcGy76e3gdJZBg', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
3,CG0qu39ItRcAd8J5liITYQ,Description not available. Showing name: Elias...,"{'business_accepts_android_pay': True, 'busine...","[delivery, pickup]",1509.822879,"[{'open': [{'is_overnight': False, 'start': '1...",UV3u1IbW-q38XWgTZgRdnQ,https://www.yelp.com/biz/elias-pita-riverside?...,"Excellent + food was delicious, great portion,...",5,2025-04-06 11:01:58,"{'id': 'kQKosEedp_QFSZS7wyB3Og', 'profile_url'...",CG0qu39ItRcAd8J5liITYQ
4,CG0qu39ItRcAd8J5liITYQ,Description not available. Showing name: Elias...,"{'business_accepts_android_pay': True, 'busine...","[delivery, pickup]",1509.822879,"[{'open': [{'is_overnight': False, 'start': '1...",43QScRvU-r0NLjt-dzG5WQ,https://www.yelp.com/biz/elias-pita-riverside?...,The food was amazing! We had the gyro's and hu...,5,2024-11-08 15:17:32,"{'id': 'ORK17DiBQHAmLYdEjzsnJQ', 'profile_url'...",CG0qu39ItRcAd8J5liITYQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,bMKxq1N2jtl1fIz38GU4aA,Description not available. Showing name: Backs...,"{'business_accepts_android_pay': True, 'busine...","[delivery, pickup]",7142.674700,"[{'open': [{'is_overnight': False, 'start': '1...",dkqAIXdtXXU4P17O5BhYjw,https://www.yelp.com/biz/backstreet-riverside?...,I have been going to Backstreet for more than ...,5,2024-11-25 19:56:28,"{'id': 'Dlv0LYWcx71cS0fZyRqqyA', 'profile_url'...",bMKxq1N2jtl1fIz38GU4aA
596,bMKxq1N2jtl1fIz38GU4aA,Description not available. Showing name: Backs...,"{'business_accepts_android_pay': True, 'busine...","[delivery, pickup]",7142.674700,"[{'open': [{'is_overnight': False, 'start': '1...",_JNOVcifNyjl_mmvxkHIhw,https://www.yelp.com/biz/backstreet-riverside?...,Best sandwich shop in Riverside. A lil pricey ...,5,2024-11-25 14:24:03,"{'id': 'Z9guTC__fTJ9-m-nji7Xqg', 'profile_url'...",bMKxq1N2jtl1fIz38GU4aA
597,kmdrAFVh0A82ZH8aFoZdAQ,Description not available. Showing name: My He...,"{'business_accepts_apple_pay': False, 'busines...","[pickup, delivery]",1167.844957,"[{'open': [{'is_overnight': False, 'start': '1...",NmRpM5Pt5VjInz6Drd89mA,https://www.yelp.com/biz/my-hero-subs-riversid...,I always come here for sandwiches and I realiz...,5,2025-04-04 21:54:51,"{'id': '0QmTpca-tbrYbgjWj2tYJw', 'profile_url'...",kmdrAFVh0A82ZH8aFoZdAQ
598,kmdrAFVh0A82ZH8aFoZdAQ,Description not available. Showing name: My He...,"{'business_accepts_apple_pay': False, 'busines...","[pickup, delivery]",1167.844957,"[{'open': [{'is_overnight': False, 'start': '1...",vzV9-t_O2EmBREmXVK9Tzw,https://www.yelp.com/biz/my-hero-subs-riversid...,Very popular small mom and pop submarine sandw...,4,2025-03-21 12:33:51,"{'id': 'fUrJ14NSDxGG0zHqAYjanw', 'profile_url'...",kmdrAFVh0A82ZH8aFoZdAQ


In [64]:
if not restaurants_df.empty and not reviews_df.empty:
    columns_to_merge = ['id', 'name', 'url', 'categories', 'rating', 'coordinates', 'price', 'location', 'phone', 'distance', 'business_hours']
    restaurants_df_subset = restaurants_df[columns_to_merge]
    restaurant_reviews_df = pd.merge(
        restaurants_df_subset,
        reviews_df,
        left_on='id',
        right_on='business_id',
        how='inner'
    )
    print(f"\nMerged DataFrame shape: {restaurant_reviews_df.shape}")
    print("\nFirst 5 rows of the merged DataFrame:")
    print(restaurant_reviews_df.head())
else:
    restaurant_reviews_df = pd.DataFrame()
    print("\nCould not merge restaurant and review data as one or both DataFrames are empty.")



Merged DataFrame shape: (600, 18)

First 5 rows of the merged DataFrame:
                     id_x                        name  \
0  7XJsjflQAnQTzvQYzAwydA  Smoke & Fire Social Eatery   
1  7XJsjflQAnQTzvQYzAwydA  Smoke & Fire Social Eatery   
2  7XJsjflQAnQTzvQYzAwydA  Smoke & Fire Social Eatery   
3  CG0qu39ItRcAd8J5liITYQ                  Elias Pita   
4  CG0qu39ItRcAd8J5liITYQ                  Elias Pita   

                                               url_x  \
0  https://www.yelp.com/biz/smoke-and-fire-social...   
1  https://www.yelp.com/biz/smoke-and-fire-social...   
2  https://www.yelp.com/biz/smoke-and-fire-social...   
3  https://www.yelp.com/biz/elias-pita-riverside?...   
4  https://www.yelp.com/biz/elias-pita-riverside?...   

                                          categories  rating_x  \
0  [{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...       4.5   
1  [{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...       4.5   
2  [{'alias': 'bbq', 'title': 'Barbeque'}, {'ali

In [65]:
restaurant_reviews_df.head()

Unnamed: 0,id_x,name,url_x,categories,rating_x,coordinates,price,location,phone,distance,business_hours,id_y,url_y,text,rating_y,time_created,user,business_id
0,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,https://www.yelp.com/biz/smoke-and-fire-social...,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",4.5,"{'latitude': 33.955971, 'longitude': -117.330408}",$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19517427309,2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...",e84DJb5bqU63D-7cObU4pQ,https://www.yelp.com/biz/smoke-and-fire-social...,"Food was good, service was great, Alexa was ve...",4,2025-04-05 12:27:18,"{'id': 'KcxN-hBjh7OsdQJh_0xvWA', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
1,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,https://www.yelp.com/biz/smoke-and-fire-social...,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",4.5,"{'latitude': 33.955971, 'longitude': -117.330408}",$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19517427309,2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...",rWQ-KpfjLDTBes6LarDuhg,https://www.yelp.com/biz/smoke-and-fire-social...,Absolutely love the atmosphere and the food is...,5,2025-03-23 21:16:35,"{'id': 'DJDC7kwj4qItmsi5BXWC8A', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
2,7XJsjflQAnQTzvQYzAwydA,Smoke & Fire Social Eatery,https://www.yelp.com/biz/smoke-and-fire-social...,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",4.5,"{'latitude': 33.955971, 'longitude': -117.330408}",$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19517427309,2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...",O3Rnl2Ew8uulDA_TBPTDtA,https://www.yelp.com/biz/smoke-and-fire-social...,They ran out of fries and coleslaw and still c...,3,2025-03-03 03:39:00,"{'id': 'S3gYxVAwJcGy76e3gdJZBg', 'profile_url'...",7XJsjflQAnQTzvQYzAwydA
3,CG0qu39ItRcAd8J5liITYQ,Elias Pita,https://www.yelp.com/biz/elias-pita-riverside?...,"[{'alias': 'mediterranean', 'title': 'Mediterr...",4.3,"{'latitude': 33.97533, 'longitude': -117.34385}",$$,"{'address1': '1490 University Ave', 'address2'...",19516866800,1509.822879,"[{'open': [{'is_overnight': False, 'start': '1...",UV3u1IbW-q38XWgTZgRdnQ,https://www.yelp.com/biz/elias-pita-riverside?...,"Excellent + food was delicious, great portion,...",5,2025-04-06 11:01:58,"{'id': 'kQKosEedp_QFSZS7wyB3Og', 'profile_url'...",CG0qu39ItRcAd8J5liITYQ
4,CG0qu39ItRcAd8J5liITYQ,Elias Pita,https://www.yelp.com/biz/elias-pita-riverside?...,"[{'alias': 'mediterranean', 'title': 'Mediterr...",4.3,"{'latitude': 33.97533, 'longitude': -117.34385}",$$,"{'address1': '1490 University Ave', 'address2'...",19516866800,1509.822879,"[{'open': [{'is_overnight': False, 'start': '1...",43QScRvU-r0NLjt-dzG5WQ,https://www.yelp.com/biz/elias-pita-riverside?...,The food was amazing! We had the gyro's and hu...,5,2024-11-08 15:17:32,"{'id': 'ORK17DiBQHAmLYdEjzsnJQ', 'profile_url'...",CG0qu39ItRcAd8J5liITYQ


In [67]:
if not restaurants_df.empty and not reviews_df.empty:
    # Group reviews by business_id and aggregate review texts into a list
    restaurant_review_texts = reviews_df.groupby('business_id')['text'].agg(list).reset_index()

    # Rename the columns for clarity during the merge
    restaurant_review_texts.rename(columns={'business_id': 'id', 'text': 'review_texts'}, inplace=True)

    # Merge the review texts back into the restaurants_df
    restaurants_df = pd.merge(restaurants_df, restaurant_review_texts, on='id', how='left')

    print("\nFirst 5 rows of the restaurants DataFrame with review texts:")
    print(restaurants_df[['id', 'name', 'review_texts']].head())
else:
    print("\nRestaurants or reviews DataFrame is empty.")




First 5 rows of the restaurants DataFrame with review texts:
                       id                             name  \
0  7XJsjflQAnQTzvQYzAwydA       Smoke & Fire Social Eatery   
1  CG0qu39ItRcAd8J5liITYQ                       Elias Pita   
2  ElH8ZbPSG1rIFA3wXyFn5Q                   The Salted Pig   
3  KIVSXV1MrwIUO8E02M0H6Q                Craving Crab Haus   
4  wsgdv_edV5bvssYk9pyOIg  Georgie’s Mediterranean Cuisine   

                                        review_texts  
0  [Food was good, service was great, Alexa was v...  
1  [Excellent + food was delicious, great portion...  
2  [The Salted Pig has always been a special spot...  
3  [this is the most amazing delicious and custom...  
4  [The food is great, they have good portions! I...  


In [71]:
restaurants_df['categories'].head()

0    [{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...
1    [{'alias': 'mediterranean', 'title': 'Mediterr...
2    [{'alias': 'gastropubs', 'title': 'Gastropubs'...
3    [{'alias': 'cajun', 'title': 'Cajun/Creole'}, ...
4    [{'alias': 'mediterranean', 'title': 'Mediterr...
Name: categories, dtype: object

In [72]:
def clean_categories_alias(categories_list):
    """
    Cleans the categories data to be a list of aliases.
    """
    if isinstance(categories_list, list):
        return [cat['alias'] for cat in categories_list if isinstance(cat, dict) and 'alias' in cat]
    return []


In [73]:
restaurants_df['categories_cleaned_alias'] = restaurants_df['categories'].apply(clean_categories_alias)


In [76]:
# Drop the original 'categories' column
restaurants_df.drop(columns=['categories'], inplace=True)

# Rename the 'categories_cleaned_alias' column to 'categories'
restaurants_df.rename(columns={'categories_cleaned_alias': 'categories'}, inplace=True)

# Display the DataFrame with the renamed and dropped columns
print(restaurants_df[['name', 'categories']].head())


                              name  \
0       Smoke & Fire Social Eatery   
1                       Elias Pita   
2                   The Salted Pig   
3                Craving Crab Haus   
4  Georgie’s Mediterranean Cuisine   

                                    categories  
0             [bbq, tradamerican, comfortfood]  
1           [mediterranean, mideastern, halal]  
2  [gastropubs, newamerican, breakfast_brunch]  
3                 [cajun, sportsbars, seafood]  
4                  [mediterranean, mideastern]  


In [77]:
restaurants_df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,rating,coordinates,transactions,price,location,phone,display_phone,distance,business_hours,attributes,description,review_texts,categories
0,7XJsjflQAnQTzvQYzAwydA,smoke-and-fire-social-eatery-riverside-2,Smoke & Fire Social Eatery,https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...,False,https://www.yelp.com/biz/smoke-and-fire-social...,2672,4.5,"{'latitude': 33.955971, 'longitude': -117.330408}",[],$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19517427309,(951) 742-7309,2642.903707,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",Description not available. Showing name: Smoke...,"[Food was good, service was great, Alexa was v...","[bbq, tradamerican, comfortfood]"
1,CG0qu39ItRcAd8J5liITYQ,elias-pita-riverside,Elias Pita,https://s3-media2.fl.yelpcdn.com/bphoto/_n6TU4...,False,https://www.yelp.com/biz/elias-pita-riverside?...,1160,4.3,"{'latitude': 33.97533, 'longitude': -117.34385}","[delivery, pickup]",$$,"{'address1': '1490 University Ave', 'address2'...",19516866800,(951) 686-6800,1509.822879,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",Description not available. Showing name: Elias...,"[Excellent + food was delicious, great portion...","[mediterranean, mideastern, halal]"
2,ElH8ZbPSG1rIFA3wXyFn5Q,the-salted-pig-riverside,The Salted Pig,https://s3-media3.fl.yelpcdn.com/bphoto/G65RHz...,False,https://www.yelp.com/biz/the-salted-pig-rivers...,3349,3.8,"{'latitude': 33.98226, 'longitude': -117.37386}",[delivery],$$,"{'address1': '3750 Main St', 'address2': 'Ste ...",19517425664,(951) 742-5664,4243.221458,"[{'open': [{'is_overnight': False, 'start': '1...","{'business_accepts_android_pay': True, 'busine...",Description not available. Showing name: The S...,[The Salted Pig has always been a special spot...,"[gastropubs, newamerican, breakfast_brunch]"
3,KIVSXV1MrwIUO8E02M0H6Q,craving-crab-haus-riverside,Craving Crab Haus,https://s3-media2.fl.yelpcdn.com/bphoto/blk-Vh...,False,https://www.yelp.com/biz/craving-crab-haus-riv...,427,4.5,"{'latitude': 33.97674, 'longitude': -117.33752}",[restaurant_reservation],$$,"{'address1': '1201 University Ave', 'address2'...",19517425859,(951) 742-5859,904.674919,"[{'open': [{'is_overnight': False, 'start': '1...","{'has_pool_table': False, 'business_url': 'htt...",Description not available. Showing name: Cravi...,[this is the most amazing delicious and custom...,"[cajun, sportsbars, seafood]"
4,wsgdv_edV5bvssYk9pyOIg,georgie-s-mediterranean-cuisine-riverside,Georgie’s Mediterranean Cuisine,https://s3-media3.fl.yelpcdn.com/bphoto/3WhUY5...,False,https://www.yelp.com/biz/georgie-s-mediterrane...,426,4.5,"{'latitude': 33.95623720813322, 'longitude': -...","[delivery, pickup]",$$,"{'address1': '5225 Canyon Crest Dr', 'address2...",19518230440,(951) 823-0440,2547.020308,"[{'open': [{'is_overnight': False, 'start': '1...",{'business_url': 'https://georgiesmediterranea...,Description not available. Showing name: Georg...,"[The food is great, they have good portions! I...","[mediterranean, mideastern]"


In [78]:
restaurants_df['attributes']

0      {'business_accepts_android_pay': True, 'busine...
1      {'business_accepts_android_pay': True, 'busine...
2      {'business_accepts_android_pay': True, 'busine...
3      {'has_pool_table': False, 'business_url': 'htt...
4      {'business_url': 'https://georgiesmediterranea...
                             ...                        
195    {'business_accepts_android_pay': True, 'busine...
196    {'business_url': None, 'about_this_biz_bio_pho...
197    {'business_accepts_android_pay': True, 'busine...
198    {'business_accepts_android_pay': True, 'busine...
199    {'business_accepts_apple_pay': False, 'busines...
Name: attributes, Length: 200, dtype: object

In [82]:
print(restaurants_df.head(1))

                       id                                     alias  \
0  7XJsjflQAnQTzvQYzAwydA  smoke-and-fire-social-eatery-riverside-2   

                         name  \
0  Smoke & Fire Social Eatery   

                                           image_url  is_closed  \
0  https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj...      False   

                                                 url  review_count  rating  \
0  https://www.yelp.com/biz/smoke-and-fire-social...          2672     4.5   

                                         coordinates transactions price  \
0  {'latitude': 33.955971, 'longitude': -117.330408}           []    $$   

                                            location         phone  \
0  {'address1': '5225 Canyon Crest Dr', 'address2...  +19517427309   

    display_phone     distance  \
0  (951) 742-7309  2642.903707   

                                      business_hours  \
0  [{'open': [{'is_overnight': False, 'start': '1...   

                        

In [84]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.11-cp38-cp38-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting huggingface-hub>=0.23.0 (from datasets)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Download

In [86]:
from datasets import Dataset

In [90]:
def create_full_detail_dataset(restaurants_df):
    """
    Creates a dataset for fine-tuning a model to generate restaurant descriptions
    using all available fields within the 'input' text.
    """
    data = []
    for index, row in restaurants_df.iterrows():
        if row['review_texts']:
            restaurant_info = {col: row[col] for col in restaurants_df.columns}
            input_text = f"Restaurant details: {json.dumps(restaurant_info, default=str)}. Reviews: {', '.join(row['review_texts'][:5])}"

            output_text = f"A restaurant named {row['name']} serving {row['categories']} in {row['location']['city']}." # Example - improve this!

            data.append({
                'instruction': "Generate a short description for this restaurant based on all its available details and reviews.",
                'input': input_text,
                'output': output_text
            })
    return Dataset.from_pandas(pd.DataFrame(data))



In [93]:
full_detail_dataset = create_full_detail_dataset(restaurants_df)


In [94]:
print(full_detail_dataset[0])


{'instruction': 'Generate a short description for this restaurant based on all its available details and reviews.', 'input': 'Restaurant details: {"id": "7XJsjflQAnQTzvQYzAwydA", "alias": "smoke-and-fire-social-eatery-riverside-2", "name": "Smoke & Fire Social Eatery", "image_url": "https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj4G_U2mlY1VRyJJtg/o.jpg", "is_closed": false, "url": "https://www.yelp.com/biz/smoke-and-fire-social-eatery-riverside-2?adjust_creative=s2u-5RnsfwMJeHOvze7LIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=s2u-5RnsfwMJeHOvze7LIQ", "review_count": 2672, "rating": 4.5, "coordinates": {"latitude": 33.955971, "longitude": -117.330408}, "transactions": [], "price": "$$", "location": {"address1": "5225 Canyon Crest Dr", "address2": "Unit 9", "address3": "", "city": "Riverside", "zip_code": "92507", "country": "US", "state": "CA", "display_address": ["5225 Canyon Crest Dr", "Unit 9", "Riverside, CA 92507"]}, "phone": "+19517427309", "display_phone

In [95]:
def save_dataset_as_jsonl(dataset, filepath):
    """Saves a Hugging Face Dataset as a JSON Lines file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        for item in dataset:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

# Save the dataset as JSON Lines
save_dataset_as_jsonl(full_detail_dataset, 'restaurant_dataset.jsonl')
print("Dataset saved to restaurant_dataset.jsonl")


Dataset saved to restaurant_dataset.jsonl


In [134]:
from datasets import Dataset
import pandas as pd
import json
from collections import defaultdict

def create_descriptive_output_dataset_from_dataset(full_detail_dataset):
    """
    Creates a dataset for fine-tuning a model where the output is a natural
    language description, using the full_detail_dataset as input.
    """
    data = []
    for item in full_detail_dataset:
        if item.get('input'):
            restaurant_details = {}
            try:
                restaurant_info_json = item['input'].split('Restaurant details: ')[1].split('. Reviews:')[0]
                restaurant_details = json.loads(restaurant_info_json)
            except (KeyError, json.JSONDecodeError, IndexError) as e:
                print(f"Error processing item: {item.get('name', 'Unknown')}. Error: {e}")
                continue

            name = restaurant_details.get('name', 'This restaurant')
            categories = restaurant_details.get('categories', [])
            attributes = restaurant_details.get('attributes', {})
            reviews = restaurant_details.get('review_texts', [])

            output_parts = [f"This restaurant {name}"]

            if categories:
                output_parts.append(f"is a {', '.join(categories)} restaurant")

            
            if attributes['ambience'] is not None:
                ambience_str = ", ".join([k.replace('_', ' ') for k, v in attributes.get('ambience', {}).items() if v == True])
                output_parts.append(f"with a {ambience_str} ambiance")

            
            if attributes['good_for_meal'] is not None:
                good_for_meal = ", ".join([k.title() for k, v in attributes.get('good_for_meal', {}).items() if v == True])
                output_parts.append(f"that's good for {good_for_meal}")

            price = restaurant_details.get('price')
            if price:
                output_parts.append(f"and has a price range of {price}")

            takeout = attributes.get('restaurants_take_out')
            if takeout:
                output_parts.append("offering takeout")

            delivery = attributes.get('restaurants_delivery')
            if delivery:
                output_parts.append("offering delivery")

            kid_friendly = attributes.get('good_for_kids')
            if kid_friendly:
                output_parts.append("that's kid-friendly")

            review_summary = ". Customers say"
            for review in reviews:
                review_summary += ", " + review
            output_parts.append(review_summary)
            output_text = " ".join(output_parts)

            data.append({
                'instruction': item['instruction'],
                'input': item['input'],
                'output': output_text.strip()
            })
    return Dataset.from_pandas(pd.DataFrame(data))

# Create the dataset with detailed descriptive outputs using the in-memory dataset
detailed_output_dataset = create_descriptive_output_dataset_from_dataset(full_detail_dataset)
print(f"Length of full_detail_dataset: {len(full_detail_dataset)}")

print(detailed_output_dataset[0])

Length of full_detail_dataset: 200
{'instruction': 'Generate a short description for this restaurant based on all its available details and reviews.', 'input': 'Restaurant details: {"id": "7XJsjflQAnQTzvQYzAwydA", "alias": "smoke-and-fire-social-eatery-riverside-2", "name": "Smoke & Fire Social Eatery", "image_url": "https://s3-media1.fl.yelpcdn.com/bphoto/vGyDDj4G_U2mlY1VRyJJtg/o.jpg", "is_closed": false, "url": "https://www.yelp.com/biz/smoke-and-fire-social-eatery-riverside-2?adjust_creative=s2u-5RnsfwMJeHOvze7LIQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=s2u-5RnsfwMJeHOvze7LIQ", "review_count": 2672, "rating": 4.5, "coordinates": {"latitude": 33.955971, "longitude": -117.330408}, "transactions": [], "price": "$$", "location": {"address1": "5225 Canyon Crest Dr", "address2": "Unit 9", "address3": "", "city": "Riverside", "zip_code": "92507", "country": "US", "state": "CA", "display_address": ["5225 Canyon Crest Dr", "Unit 9", "Riverside, CA 92507"]}, "pho

In [100]:
from datasets import load_dataset

In [135]:

def save_dataset_as_jsonl(dataset, filepath):
    """Saves a Hugging Face Dataset as a JSON Lines file."""
    with open(filepath, 'w', encoding='utf-8') as f:
        for item in dataset:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

# Save the dataset as JSON Lines
save_dataset_as_jsonl(detailed_output_dataset, 'restaurant_dataset.jsonl')
print("Dataset saved to restaurant_dataset.jsonl")


Dataset saved to restaurant_dataset.jsonl
