## Clustering Locations

Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Load data

In [2]:
# Load the processed venues table and print its schema (columns, dtypes, null counts).
venues_csv_path = '../data/processed/venues.csv'
df_venues = pd.read_csv(venues_csv_path)
df_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   latitude     47 non-null     float64
 1   longitude    47 non-null     float64
 2   fsq_id       47 non-null     object 
 3   address      47 non-null     object 
 4   country      47 non-null     object 
 5   postal_code  47 non-null     object 
 6   region       47 non-null     object 
 7   restaurants  47 non-null     int64  
 8   groceries    47 non-null     int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 3.4+ KB


Clustering residential locations

In [54]:
# Features used for clustering: the restaurant and grocery counts of each location.
filtering_labels = ['restaurants', 'groceries']

# Standardise the two count columns before clustering so neither dominates
# the distance computation purely because of its larger scale.
scaler = StandardScaler()
df_venues_filtered = scaler.fit_transform(df_venues[filtering_labels])

# Fit k-means (3 clusters, fixed seed) on the scaled features and attach the
# resulting label of every row back onto the venues table.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(df_venues_filtered)
df_venues['cluster'] = kmeans.labels_
df_venues.head()

Unnamed: 0,latitude,longitude,fsq_id,address,country,postal_code,region,restaurants,groceries,cluster
0,42.12344,-80.077909,4f2293846d867182dff007d9,153 E 13th St,US,16503,PA,41,6,2
1,42.126495,-80.087438,4ef9be2cc512f2277a8b55f8,160 W 8th St,US,16501,PA,49,5,2
2,42.067218,-80.08973,4bf522b498ac0f47fbb564a8,1717 Kuntz Rd,US,16509,PA,26,3,1
3,42.091566,-79.296095,512bd37ae4b01d58196c800a,2109 Southwestern Dr,US,14750,NY,11,3,1
4,42.109142,-79.287287,5143ddc752cd4de5f8eecdec,75 Marine Park Dr,US,14720,NY,1,0,0


In [55]:
# Tally how many venues were assigned to each cluster (largest cluster first).
cluster_sizes = df_venues['cluster'].value_counts()
cluster_sizes

cluster
0    31
1    14
2     2
Name: count, dtype: int64

In [56]:
# Total restaurants and groceries per cluster.
# Select the two count columns *before* aggregating so the string columns
# (fsq_id, address, ...) are not summed/concatenated only to be discarded.
# NOTE: these are raw totals and are not normalised by cluster size, so they
# should be read together with the per-cluster venue counts.
df_venues.groupby('cluster')[['restaurants', 'groceries']].sum()

Unnamed: 0_level_0,restaurants,groceries
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,168,11
1,233,37
2,90,11


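The totals above are not normalised by cluster size (31, 14 and 2 locations for clusters 0, 1 and 2 respectively). Per location, cluster 2 averages roughly 45 restaurants and 5.5 groceries, cluster 1 roughly 17 and 2.6, and cluster 0 roughly 5 and 0.4. We can therefore infer that cluster 2 suits students who dine out frequently, whereas cluster 0 suits students who exhibit the opposite behavior. Cluster 1 lies somewhere in between these two extremes.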

In [63]:
# Name each cluster by how many restaurants a typical location in it has.
# The mapping is derived from the per-cluster *mean* restaurant count instead
# of being hard-coded by label id: raw totals are misleading because the
# clusters differ greatly in size, and k-means label ids are arbitrary.
dining_levels = ['Minimal dining', 'Moderate dining', 'Frequent dining']
clusters_by_restaurants = (
    df_venues.groupby('cluster')['restaurants'].mean().sort_values().index
)
# One name per cluster; a mismatch would silently leave some rows unnamed (NaN).
assert len(clusters_by_restaurants) == len(dining_levels), \
    "number of clusters does not match number of dining levels"
# Fewest restaurants per location -> 'Minimal dining', most -> 'Frequent dining'.
clusters_dict = dict(zip(clusters_by_restaurants, dining_levels))
#add cluster names to the dataframe
# NOTE(review): the column name keeps its original spelling ('recomended for')
# because it is written to the output CSV; confirm consumers before renaming.
df_venues['recomended for'] = df_venues['cluster'].map(clusters_dict)

In [64]:
# Persist the venues table, including the cluster columns, without the row index.
clustered_csv_path = '../data/processed/venues_with_clusters.csv'
df_venues.to_csv(clustered_csv_path, index=False)