### 1. Clustering Business Hubs Across the USA

In [77]:
import pandas as pd
import json

# Load the business.json file, I changed this to my desktop you will have to change it back to
#your own file path 
data = []
with open(r"C:\Users\blubb\Desktop\yelp_academic_dataset_business.json", 'r', encoding='utf-8') as file:  # Add encoding
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            print(f"Problematic line: {line}")

# Create a DataFrame
df = pd.DataFrame(data)

# Extract relevant fields
geo_data = df[['latitude', 'longitude']].dropna()

#df.head()


In [78]:
import sklearn
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=11, random_state=42)
geo_data['cluster'] = kmeans.fit_predict(geo_data[['latitude', 'longitude']])

In [79]:
import plotly.express as px

fig = px.scatter_geo(geo_data, lat='latitude', lon='longitude', color='cluster',
                     color_discrete_sequence=px.colors.qualitative.Set3, scope='usa',
                     title='Business Hubs Across the USA')

cluster_centers = kmeans.cluster_centers_
centers_df = pd.DataFrame(cluster_centers, columns=['latitude', 'longitude'])
centers_df['cluster'] = 'Center'

fig.add_trace(px.scatter_geo(centers_df, lat='latitude', lon='longitude',
                             color_discrete_sequence=['black']).data[0])


#fig.show()

###  2. Clustering Businesses Within Individual Cities

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
pd.set_option("future.no_silent_downcasting", True)

# Extract relevant fields
geo_data = df[['latitude', 'longitude', 'city', 'state', 'stars']].dropna()
#change the integers to a string
geo_data['stars'] = geo_data['stars'].astype(str)

#turn data into an array 
temp = geo_data['stars']
a = np.array(temp)

#turn it back into a dataframe and convert to category matrix
X = pd.DataFrame({'stars' : a})
category_matrix = pd.get_dummies(X['stars'])
#change T/F to 1/0
category_matrix.replace({False: 0, True: 1}, inplace = True)

# Reset index after concatenation
X = pd.concat([geo_data[['latitude', 'longitude']].reset_index(drop=True), 
                category_matrix.reset_index(drop=True)], axis=1)

rating = []
#fucntion to cluster by city and state since many cities share the same name
def cluster_businesses_by_city(city, state):
    # Filter the data for the given city
    city_data = X.loc[(geo_data['state'].reset_index(drop=True) == state) & (geo_data['city'].reset_index(drop=True) == city)]

    # Perform K-Means clustering
    kmeans = KMeans(n_clusters=5, random_state=42)
    city_data = city_data.copy()  # Avoid SettingWithCopyWarning
    city_data['cluster'] = kmeans.fit_predict(city_data)

    #get labels
    labels = kmeans.labels_

    # Get data for cluster 0
    x = 0
    #get average rating for each cluster 
    for x in range(0,5):
        clusterdata = city_data[labels == x]
        cols = clusterdata.iloc[:, 2:10]

        #get star for each row
        cols['true_column'] = cols.agg(lambda s: [col for i, col in enumerate(cols.columns) 
                                            if s[i]], axis=1)

        #get rid of brackets and change to numeric value
        cols['true_column'] = cols['true_column'].str.get(0)
        cols['true_column'] = pd.to_numeric(cols['true_column'], errors = 'coerce').copy()
        #turn NA into 0
        cols['true_column'] = cols['true_column'].fillna(0)

        #find sum and add to list
        rating.append(sum(cols['true_column'])/len(cols))
        
    # Print the cluster details
    print(f"Clustering for {city}:")
    for cluster_id in city_data['cluster'].unique():
        cluster_businesses = city_data[city_data['cluster'] == cluster_id]
        print(f"Cluster {cluster_id}:")
        print(cluster_businesses[['latitude', 'longitude']])  # Adjust based on available fields
        print()

# Example usage
cluster_businesses_by_city('Tucson', 'AZ')


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as la

In [None]:
import folium
import pandas as pd
from sklearn.cluster import KMeans
import geojson
import geopandas as gpd
import osmnx as ox

def cluster_businesses_on_map(city, state):
    # Filter the data for the given city
    city_data = X.loc[(geo_data['state'].reset_index(drop=True) == state) & (geo_data['city'].reset_index(drop=True) == city)]

    # Perform K-Means clustering
    kmeans = KMeans(n_clusters=5, random_state=42)
    city_data = city_data.copy()  # Avoid SettingWithCopyWarning
    city_data['cluster'] = kmeans.fit_predict(city_data)

    # Create a Folium map centered on the city
    city_latitude = city_data['latitude'].mean()
    city_longitude = city_data['longitude'].mean()
    map_city = folium.Map(location=[city_latitude, city_longitude], zoom_start=12)

    # Add the business clusters to the map
    for cluster_id in city_data['cluster'].unique():
        cluster_businesses = city_data[city_data['cluster'] == cluster_id]
        cluster_latitude = cluster_businesses['latitude'].mean()
        cluster_longitude = cluster_businesses['longitude'].mean()
        folium.Marker(
            location=[cluster_latitude, cluster_longitude],
            popup=f"Cluster {cluster_id}",
            icon=folium.Icon(color="blue", icon='star'),
            tooltip=rating[cluster_id] #shows the average cluster rating when hovering mouse over it
        ).add_to(map_city)

    #specific city and state
    city = city + ', ' + state

    #Get city boundary
    gdf = ox.geocode_to_gdf(city)

    #Save boundary into GeoJSON file
    gdf.to_file("city_boundary.geojson", driver="GeoJSON")

    #load the file
    with open('city_boundary.geojson') as cityBound:
        city_geojson = geojson.load(cityBound)

    #and map it out
    folium.GeoJson(
        city_geojson,
        name='City Boundaries',
        style_function=lambda feature: {
            'color': 'blue',
            'weight': 2,
            'fillOpacity': 0.3
        }
    ).add_to(map_city)

    return map_city

# Example usage
map_riv = cluster_businesses_on_map('Tucson', 'AZ')
map_riv