## Unique ID Acquisition

In [6]:

import requests
import pandas as pd
import time
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from env import YELP_ID, YELP_API_KEY

# Crawl configuration and shared state for the Yelp business search.
# NOTE(review): `api_key` is not read by the code visible in this cell
# (fetch_page uses YELP_API_KEY directly) — confirm it is still needed.
api_key = YELP_API_KEY
api_endpoint = 'https://api.yelp.com/v3/businesses/search'
limit = 50  # value sent as the `limit` query parameter (results per page)
offset = 0  # value sent as the `offset` query parameter; advanced by `limit` per page
unique_business_ids = set()  # business ids already collected, used to skip duplicates
all_businesses = []  # business dicts in first-seen order; later turned into a DataFrame

def fetch_page(offset, max_retries=3, retry_delay=5, timeout=10):
    """Fetch one page of New York businesses from the Yelp search endpoint.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between attempts.
        timeout: Seconds to wait on the server before an attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        The list stored under ``businesses`` in the JSON response, or an
        empty list when the request fails or the key is missing.
    """
    url = f'{api_endpoint}?location=New+York&limit={limit}&offset={offset}'
    headers = {
        'Authorization': f'Bearer {YELP_API_KEY}'
    }

    for retry in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise an error for 4xx/5xx responses

            if response.status_code == 200:
                return response.json().get('businesses', [])
            return []

        except requests.exceptions.RequestException as e:
            print(f"Request error (Retry {retry + 1}/{max_retries}): {e}")

            # Only transient failures are worth another attempt: network
            # errors (no response at all), rate limiting (429) and server
            # errors (5xx). Any other 4xx will fail the same way again.
            status = getattr(e.response, 'status_code', None)
            transient = status is None or status == 429 or status >= 500

            if not transient:
                print("Non-retryable response. Skipping this request.")
                return []
            if retry < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Max retries reached. Skipping this request.")
                return []

    # Reached only when max_retries <= 0; still honour the list contract.
    return []

# Fetch result pages one at a time until fetch_page returns an empty page.
# Only one request is ever in flight (max_workers=1 and a single submit per
# iteration), so the executor is kept for structure rather than for speed.
headers = {
    'Authorization': f'Bearer {YELP_API_KEY}'
}
reviews_by_business = {}  # business id -> list of review dicts from the reviews endpoint

with ThreadPoolExecutor(max_workers=1) as executor:  # Adjust max_workers as needed
    more_pages = True
    while more_pages:
        future_to_offset = {executor.submit(fetch_page, offset): offset}
        offset += limit

        for future in concurrent.futures.as_completed(future_to_offset):
            businesses = future.result()

            if not businesses:
                # An empty page (no more results, or a request that failed
                # for good) ends the crawl. A bare `break` only leaves this
                # inner loop, so also clear the flag driving the `while`.
                more_pages = False
                break

            for business in businesses:
                business_id = business.get('id')
                if business_id in unique_business_ids:
                    continue

                unique_business_ids.add(business_id)
                all_businesses.append(business)

                # retrieve reviews for this business
                reviews_url = f'https://api.yelp.com/v3/businesses/{business_id}/reviews'

                try:
                    # The timeout keeps one stalled request from hanging the crawl
                    reviews_response = requests.get(reviews_url, headers=headers, timeout=10)
                    reviews_response.raise_for_status()

                    if reviews_response.status_code == 200:
                        reviews_data = reviews_response.json().get('reviews', [])
                        # Keep the reviews instead of discarding them
                        reviews_by_business[business_id] = reviews_data
                    else:
                        print(f"Failed to retrieve reviews for business {business_id}.")

                except requests.exceptions.RequestException as e:
                    print(f"Request error for reviews of business {business_id}: {e}")

        if not more_pages:
            # Nothing was processed for this page: skip the progress line and the pause
            break

        print(f"Processed {offset} records")

        # Sleep for 60 seconds to respect QPS rate limiting
        time.sleep(60)

print(f"Total unique business IDs: {len(unique_business_ids)}")

# Now 'all_businesses' contains unique business data

# Convert the data into a DataFrame
df = pd.DataFrame(all_businesses)


Processed 50 records
Processed 100 records
Processed 150 records
Processed 200 records
Processed 250 records
Processed 300 records
Processed 350 records
Processed 400 records
Request error for reviews of business LOeDR4X-YRToeo3hy_plew: 429 Client Error: Too Many Requests for url: https://api.yelp.com/v3/businesses/LOeDR4X-YRToeo3hy_plew/reviews
Request error for reviews of business 9o4y3IqpFDaj8FddJ6oi2A: 429 Client Error: Too Many Requests for url: https://api.yelp.com/v3/businesses/9o4y3IqpFDaj8FddJ6oi2A/reviews
Request error for reviews of business 1aD88qNRb08LGBkl84GQTA: 429 Client Error: Too Many Requests for url: https://api.yelp.com/v3/businesses/1aD88qNRb08LGBkl84GQTA/reviews
Request error for reviews of business g3jNtkQZVZrrC823oBxdtg: 429 Client Error: Too Many Requests for url: https://api.yelp.com/v3/businesses/g3jNtkQZVZrrC823oBxdtg/reviews
Request error for reviews of business K9_vumaLO80LXgoPNJJvRQ: 429 Client Error: Too Many Requests for url: https://api.yelp.com/v3/bu

KeyboardInterrupt: 

In [7]:
# Build the DataFrame from whatever businesses have been collected so far.
# NOTE(review): the fetch cell above ended in KeyboardInterrupt, so its own
# final `df = ...` line presumably never ran — confirm.
df = pd.DataFrame(all_businesses)

In [15]:
# Show the nested `location` value of the row labelled 0
df.loc[0, 'location']

{'address1': '42 E 20th St',
 'address2': '',
 'address3': '',
 'city': 'New York',
 'zip_code': '10003',
 'country': 'US',
 'state': 'NY',
 'display_address': ['42 E 20th St', 'New York, NY 10003']}

In [8]:
# Print column names, dtypes and non-null counts for the collected businesses
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             436 non-null    object 
 1   alias          436 non-null    object 
 2   name           436 non-null    object 
 3   image_url      436 non-null    object 
 4   is_closed      436 non-null    bool   
 5   url            436 non-null    object 
 6   review_count   436 non-null    int64  
 7   categories     436 non-null    object 
 8   rating         436 non-null    float64
 9   coordinates    436 non-null    object 
 10  transactions   436 non-null    object 
 11  price          401 non-null    object 
 12  location       436 non-null    object 
 13  phone          436 non-null    object 
 14  display_phone  436 non-null    object 
 15  distance       436 non-null    float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 51.6+ KB


In [11]:
# df.to_csv('yelp_data.csv', index=None)

In [19]:
import prepare as p

# Load the raw NY dataset from the working directory
ny = pd.read_csv('ny.csv')

In [21]:
# Clean the frame with the project-local `prepare.clean_ny` helper (not shown here).
# NOTE(review): `ny` is rebound to its own cleaned version, so re-running this
# cell feeds already-cleaned data back into clean_ny — confirm that is safe.
ny = p.clean_ny(ny)

In [26]:
# Persist the cleaned frame without the index; a later cell reads this file back as `ny_df`
ny.to_csv('ny_cleaned.csv', index=False)

In [28]:
import pandas as pd
import concurrent.futures

# Yelp side: read the export and normalise business names (lower-case, trimmed)
yelp_df = pd.read_csv('yelp_data.csv')
yelp_df['name'] = yelp_df['name'].str.lower().str.strip()

# NY side: read the cleaned dataset and normalise the DBA names the same way
ny_df = pd.read_csv('ny_cleaned.csv')
ny_df['dba'] = ny_df['dba'].str.lower().str.strip()

# Distinct DBA values present in the NY data
unique_ny_dbas = set(ny_df['dba'])

# Accumulator for matches: normalised Yelp name -> Yelp ID
matching_yelp_ids = dict()

# Report whether a Yelp business name also appears among the NY DBA names
def find_matching_id(yelp_name, yelp_id, ny_df):
    """Return the Yelp ID when ``yelp_name`` equals any value in ``ny_df['dba']``.

    Args:
        yelp_name: Business name to look for (compared with ``==``, so any
            normalisation must already have been applied by the caller).
        yelp_id: ID to report when the name is found.
        ny_df: DataFrame with a ``dba`` column to search.

    Returns:
        ``(yelp_name, yelp_id)`` when at least one ``dba`` value equals
        ``yelp_name``, otherwise ``(yelp_name, None)``.
    """
    # One vectorised comparison instead of a Python-level loop over every row;
    # missing values never compare equal, so they cannot produce a match.
    found = bool(ny_df['dba'].eq(yelp_name).any())
    return yelp_name, (yelp_id if found else None)

# Match Yelp names against the NY DBA names with a single set lookup per name.
# The comparison is CPU-bound pure-Python work, which threads do not speed up
# under CPython's GIL. Collecting results via as_completed also made the
# winning ID for duplicate Yelp names depend on completion order.
ny_dba_lookup = set(ny_df['dba'].dropna())  # drop NaN so a missing name can never "match"

# Iterate in DataFrame order so duplicate names resolve deterministically
# (the last occurrence wins, as with plain dict assignment).
for yelp_name, yelp_id in zip(yelp_df['name'], yelp_df['id']):
    # Same acceptance rule as before: name present among the DBAs and a truthy ID
    if yelp_id and yelp_name in ny_dba_lookup:
        matching_yelp_ids[yelp_name] = yelp_id

# Now, matching_yelp_ids contains Yelp IDs that match unique business DBAs from the NY DataFrame


In [29]:
# 'matching_yelp_ids' maps each matched business name to a single Yelp ID
# (not a list), so each value is converted with str() directly. Joining it
# character by character would raise on any non-string ID.
data = {
    'business_name': list(matching_yelp_ids.keys()),
    'yelp_ids': [str(yelp_id) for yelp_id in matching_yelp_ids.values()],
}

# Create a DataFrame from the data
matched_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
matched_df.to_csv('matched_businesses.csv', index=False)

In [None]:
# required imports to implement pagination and parallelization
import requests
import pandas as pd
import time
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from env import YELP_ID, YELP_API_KEY

# API credentials and endpoint
# NOTE(review): `api_key` is not read by the code visible in this cell
# (fetch_page uses YELP_API_KEY directly) — confirm it is still needed.
api_key = YELP_API_KEY
api_endpoint = 'https://api.yelp.com/v3/businesses/search'

# Pagination parameters per the API businesses reference page
limit = 50  # Number of results per page
offset = 0  # Start with the first page; advanced by `limit` after each request

unique_business_ids = set()  # Set to store unique business IDs
all_businesses = []  # List to store all business data

# Single-attempt page fetcher for the business search endpoint
def fetch_page(offset):
    """Request one page of New York search results starting at ``offset``.

    Returns the list stored under ``businesses`` in the JSON body when the
    server answers 200, and an empty list for any other status code.
    """
    response = requests.get(
        f'{api_endpoint}?location=New+York&limit={limit}&offset={offset}',
        headers={'Authorization': f'Bearer {YELP_API_KEY}'},
    )

    # Guard clause: anything other than a plain 200 is treated as "no results"
    if response.status_code != 200:
        return []

    payload = response.json()
    return payload.get('businesses', [])

# Fetch result pages one at a time until fetch_page returns an empty page.
# Only one request is ever in flight (max_workers=1 and a single submit per
# iteration), so the executor is kept for structure rather than for speed.
with ThreadPoolExecutor(max_workers=1) as executor:  # Adjust max_workers as needed
    more_pages = True
    while more_pages:
        future_to_offset = {executor.submit(fetch_page, offset): offset}
        offset += limit

        # Wait for the tasks to complete
        for future in concurrent.futures.as_completed(future_to_offset):
            businesses = future.result()

            if not businesses:
                # An empty page ends the crawl. A bare `break` only leaves
                # this inner loop, so also clear the flag driving the `while`.
                more_pages = False
                break

            for business in businesses:
                business_id = business.get('id')
                if business_id not in unique_business_ids:
                    unique_business_ids.add(business_id)
                    all_businesses.append(business)

        if not more_pages:
            # Nothing left to fetch: skip the pause and leave the loop
            break

        # Sleep for 60 seconds to respect QPS rate limiting
        time.sleep(60)

# Now 'all_businesses' contains unique business data

# Convert the data into a DataFrame
df = pd.DataFrame(all_businesses)

In [30]:
# Preview the first rows of the frame loaded from ny_cleaned.csv.
# NOTE(review): the `Unnamed: 0` column in the output looks like a saved index
# carried over from an upstream CSV — confirm and drop it if so.
ny_df.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,cuisine_description,action,violation_code,violation_description,score,grade,inspection_type
0,41168748,dunkin,Bronx,880,GARRISON AVENUE,10474,7188614000.0,2022-03-30T00:00:00.000,Not Critical,2023-10-26T06:00:11.000,40.816753,-73.892364,Donuts,Violations cited,10J,Hand wash sign not posted,13.0,A,Cycle Inspection / Initial Inspection
1,41688142,table 87,Brooklyn,620,ATLANTIC AVENUE,11217,9176186000.0,2017-01-25T00:00:00.000,Not Applicable,2023-10-26T06:00:11.000,40.683447,-73.975691,Pizza,No violations,No violation,No violation,0.0,A,Cycle Inspection / Initial Inspection
2,50100336,subway,Brooklyn,8711,3 AVENUE,11209,7186809000.0,2022-04-05T00:00:00.000,Not Critical,2023-10-26T06:00:11.000,40.622569,-74.031412,Sandwiches,Violations cited,09B,Thawing procedures improper.,10.0,A,Cycle Inspection / Initial Inspection
3,50086686,gertie,Brooklyn,58,MARCY AVENUE,11211,7186361000.0,2021-08-25T00:00:00.000,Not Applicable,2023-10-26T06:00:13.000,40.71236,-73.955419,American,No violations,No violation,No violation,0.0,A,Cycle Inspection / Initial Inspection
4,50081121,dunkin,Brooklyn,2492,LINDEN BOULEVARD,11208,7182729000.0,2022-04-04T00:00:00.000,Not Critical,2023-10-26T06:00:11.000,40.666827,-73.871606,Donuts,Violations cited,10J,Hand wash sign not posted,24.0,B,Cycle Inspection / Initial Inspection
