## Location Clustering 
This workbook is to classify the locations in the dataset

In [1]:
import pandas as pd
import numpy as np

dir = 'E:\\Sebnewrepo\\Data\\checkin_data\\dataset_tsmc2014/'
checkin_file = 'dataset_TSMC2014_NYC.txt'

In [2]:
col = ['user_id',
       'poi_id',
       'poi_category_id',
       'poi_category_name',
       'latitude', 
       'longitude',
       'time_offset',
       'UTC_time']
df = pd.read_csv(dir + checkin_file, delimiter = "\t", names = col)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [3]:
# Test how many categories in the dataset
df.shape

(227428, 8)

In [4]:
# number of users and number of POIs
print('Amount of Users: ', len(df['user_id'].unique()))
print('Amount of Items: ', len(df['poi_id'].unique()))

Amount of Users:  1083
Amount of Items:  38333


### Extract datetime info from column UTC_time
use regex to extract time info

In [5]:
# datetime converting
# use regex to extract time info

# extract day of week
import re

def extract_wkd(data):
    re_wkd = r'[A-Z]?[a-z]+\s'
    result = re.match(re_wkd, data)
    result = result[0].split(' ')
    return result[0]
df['day_of_week'] = df['UTC_time'].apply(extract_wkd)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue


In [6]:
df.groupby('day_of_week')['day_of_week'].count()

day_of_week
Fri    33824
Mon    32487
Sat    34401
Sun    32636
Thu    30558
Tue    32129
Wed    31393
Name: day_of_week, dtype: int64

In [7]:
# extract time of the day
from datetime import datetime
from datetime import timedelta
def extract_mth(data):
    result = data.split(' ')[1] + ' ' + data.split(' ')[2] + ' '  + data.split(' ')[3] + ' '  + data.split(' ')[5]
    dtm_str = datetime.strptime(result, '%b %d %H:%M:%S %Y')
    return dtm_str

df['datetime'] = df['UTC_time'].apply(extract_mth)
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week,datetime
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue,2012-04-03 18:00:09
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue,2012-04-03 18:00:25
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue,2012-04-03 18:02:24
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue,2012-04-03 18:02:41
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue,2012-04-03 18:03:00


In [8]:
# add time_offset

df['datetime_real'] = df['datetime']+ pd.to_timedelta(df['time_offset'], unit = 'm')
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week,datetime,datetime_real
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue,2012-04-03 18:00:09,2012-04-03 14:00:09
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue,2012-04-03 18:00:25,2012-04-03 14:00:25
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue,2012-04-03 18:02:24,2012-04-03 14:02:24
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue,2012-04-03 18:02:41,2012-04-03 14:02:41
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue,2012-04-03 18:03:00,2012-04-03 14:03:00


In [9]:
# only keep hour in the timestamp
df['hour_in_the_day'] = pd.Series(val.hour for val in df['datetime_real'])
df.head()

Unnamed: 0,user_id,poi_id,poi_category_id,poi_category_name,latitude,longitude,time_offset,UTC_time,day_of_week,datetime,datetime_real,hour_in_the_day
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012,Tue,2012-04-03 18:00:09,2012-04-03 14:00:09,14
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012,Tue,2012-04-03 18:00:25,2012-04-03 14:00:25,14
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012,Tue,2012-04-03 18:02:24,2012-04-03 14:02:24,14
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012,Tue,2012-04-03 18:02:41,2012-04-03 14:02:41,14
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012,Tue,2012-04-03 18:03:00,2012-04-03 14:03:00,14


In [10]:
pd.Series(val.hour for val in df[df['poi_id'] == '5116fef9e4b066680e7fab07']['datetime_real'])

0    11
1    14
2    15
3    16
4    22
5    23
6     0
7     4
8    15
dtype: int64

### Feature Engineering
The features selected in the dataset is based on statistics of locations acted by user's behavior:
1. number of visits
2. avg visits per users (returned users)
3. Num of visits in one-hour slice
4. Num of visits in weekday/ weekend (one day slice)

In [11]:
# Number of visits
df_f = df.groupby('poi_id')['user_id'].count()
df_f = df_f.reset_index()
df_f.columns = (['poi_id', 'num_visits'])
df_f.head()

Unnamed: 0,poi_id,num_visits
0,3fd66200f964a52000e71ee3,16
1,3fd66200f964a52000e81ee3,2
2,3fd66200f964a52000f11ee3,1
3,3fd66200f964a52001e51ee3,1
4,3fd66200f964a52001e81ee3,17


In [12]:
# avg visits per users
df_2 = df.groupby('poi_id')['user_id'].nunique()
df_2 = df_2.reset_index()
df_f['unique_user'] = df_2['user_id']


In [13]:
df_f['avg_visits_per_user'] = df_f['num_visits']/df_f['unique_user']
df_f = df_f.drop(columns = ['unique_user'])
df_f.head()

Unnamed: 0,poi_id,num_visits,avg_visits_per_user
0,3fd66200f964a52000e71ee3,16,1.0
1,3fd66200f964a52000e81ee3,2,1.0
2,3fd66200f964a52000f11ee3,1,1.0
3,3fd66200f964a52001e51ee3,1,1.0
4,3fd66200f964a52001e81ee3,17,1.0625


In [14]:
# Num of visits in day of the week
m = df.groupby(['poi_id','day_of_week'])['time_offset'].count()
m = m.reset_index()
# pivot table
m_pivot = pd.pivot_table(m, values = 'time_offset', index = 'poi_id', columns = ['day_of_week'])
m_pivot = m_pivot.fillna(0).reset_index()
m_pivot.head()

day_of_week,poi_id,Fri,Mon,Sat,Sun,Thu,Tue,Wed
0,3fd66200f964a52000e71ee3,3.0,0.0,6.0,5.0,1.0,0.0,1.0
1,3fd66200f964a52000e81ee3,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,3fd66200f964a52000f11ee3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3fd66200f964a52001e51ee3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3fd66200f964a52001e81ee3,4.0,0.0,5.0,4.0,2.0,1.0,1.0


In [15]:
# Num of visits in an one-hour slice in a day
n = df.groupby(['poi_id','hour_in_the_day'])['time_offset'].count()
n = n.reset_index()
# pivot table
n_pivot = pd.pivot_table(n, values = 'time_offset', index = 'poi_id', columns = ['hour_in_the_day'])
n_pivot = n_pivot.fillna(0).reset_index()
n_pivot.head()

hour_in_the_day,poi_id,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,3fd66200f964a52000e71ee3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,4.0,2.0,3.0,0.0,1.0
1,3fd66200f964a52000e81ee3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3fd66200f964a52000f11ee3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3fd66200f964a52001e51ee3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3fd66200f964a52001e81ee3,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,1.0


In [17]:
## combine all features together
feature0 = df_f.merge(m_pivot, how = 'left', on = 'poi_id')
feature0 = feature0.merge(n_pivot, how = 'left', on = 'poi_id')
feature = feature0.set_index('poi_id')
feature

Unnamed: 0_level_0,num_visits,avg_visits_per_user,Fri,Mon,Sat,Sun,Thu,Tue,Wed,0,...,14,15,16,17,18,19,20,21,22,23
poi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3fd66200f964a52000e71ee3,16,1.0000,3.0,0.0,6.0,5.0,1.0,0.0,1.0,2.0,...,0.0,0.0,1.0,1.0,1.0,4.0,2.0,3.0,0.0,1.0
3fd66200f964a52000e81ee3,2,1.0000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3fd66200f964a52000f11ee3,1,1.0000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3fd66200f964a52001e51ee3,1,1.0000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3fd66200f964a52001e81ee3,17,1.0625,4.0,0.0,5.0,4.0,2.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51190f52e4b0f83278d5b9ba,1,1.0000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51194e17e4b0665cf3280ce3,1,1.0000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
511aa583e4b03acfa105f50b,1,1.0000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
511ae28ce4b00516bc52ccae,1,1.0000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [22]:
# get rid of infrequent data
feature = feature[feature['num_visits'] >= 5]

### Clustering
Hierachical clustering is used here

In [23]:
# step 1: normalization
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_feature = pd.DataFrame(
                        ss.fit_transform(feature), 
                        columns = feature.columns.values.tolist()
)
df_feature

Unnamed: 0,num_visits,avg_visits_per_user,Fri,Mon,Sat,Sun,Thu,Tue,Wed,0,...,14,15,16,17,18,19,20,21,22,23
0,-0.067387,-0.507718,0.061491,-0.522045,0.741566,0.579813,-0.328170,-0.534906,-0.340227,0.898627,...,-0.400400,-0.419252,0.107696,0.00469,-0.059613,1.029032,0.413958,1.344644,-0.399978,0.281844
1,-0.033121,-0.503151,0.257919,-0.522045,0.524522,0.351809,-0.102220,-0.329825,-0.340227,2.083785,...,-0.400400,-0.419252,-0.423028,-0.38573,0.631997,0.302370,0.413958,0.147154,0.222258,0.281844
2,-0.444316,-0.507718,-0.527793,-0.324780,-0.343655,-0.104198,-0.554121,-0.534906,-0.340227,-0.286531,...,-0.400400,-0.419252,-0.423028,-0.38573,-0.405418,-0.424292,-0.403403,0.745899,-0.399978,0.947094
3,-0.101653,-0.502498,-0.134937,-0.324780,-0.126611,0.123805,-0.102220,-0.124744,0.103880,-0.286531,...,-0.400400,0.065404,0.107696,0.00469,-0.405418,-0.424292,0.005278,0.745899,1.466730,-0.383406
4,0.035412,-0.441945,-0.134937,-0.127514,0.741566,0.351809,-0.554121,-0.329825,0.325933,0.306048,...,-0.400400,-0.419252,-0.423028,-0.38573,0.286192,-0.060961,-0.403403,1.943390,2.088966,1.612345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9984,-0.238718,0.223094,-0.527793,-0.522045,-0.343655,-0.560206,0.575633,0.285418,-0.340227,0.898627,...,0.926018,-0.419252,0.107696,0.00469,-0.405418,0.302370,-0.403403,-0.451591,-0.399978,0.281844
9985,-0.375783,-0.069231,-0.331365,-0.324780,-0.343655,-0.332202,-0.102220,-0.329825,-0.562280,0.306048,...,-0.400400,-0.419252,-0.423028,0.00469,-0.405418,-0.424292,0.005278,0.147154,0.844494,0.281844
9986,0.206744,-0.501074,-0.527793,-0.522045,3.780184,0.351809,-0.554121,-0.534906,-0.562280,0.306048,...,0.926018,0.550059,0.638421,-0.38573,-0.405418,0.302370,-0.403403,0.147154,-0.399978,-0.383406
9987,-0.170186,-0.501628,0.454347,-0.522045,0.307477,0.123805,-0.554121,-0.329825,-0.562280,-0.286531,...,-0.400400,0.065404,-0.423028,-0.38573,0.286192,-0.060961,0.413958,0.147154,-0.399978,-0.383406


In [24]:
# make cluster

from sklearn.cluster import AgglomerativeClustering
# use cosine similarity to generate clusters
hc = AgglomerativeClustering(n_clusters = 200, affinity = 'cosine',linkage = 'complete')
labels = hc.fit_predict(df_feature)
print(labels)

[ 52 122  65 ... 126 187 137]


In [25]:
# make poi_id clusters table
poi_cluster = pd.DataFrame({
    'poi_id': feature.index.tolist(),
    'clusters': labels
})
poi_cluster['clusters'] = poi_cluster['clusters'] + 1 # plus 1 offset
poi_cluster

Unnamed: 0,poi_id,clusters
0,3fd66200f964a52000e71ee3,53
1,3fd66200f964a52001e81ee3,123
2,3fd66200f964a52003e51ee3,66
3,3fd66200f964a52003e71ee3,26
4,3fd66200f964a52004e41ee3,37
...,...,...
9984,510188aae4b055abaa8e5ca7,65
9985,51073fcae4b047c0870d2216,168
9986,510c9f9de4b0ec5cc005fc87,127
9987,51140198e4b0874a568cde81,188


In [26]:
# output
poi_cluster.to_csv('poi_cluster.csv', index = False)

#### Test pannel

In [None]:
poi_cluster['clusters'].unique()

In [None]:

test2 = poi_cluster.merge(feature0, how = 'left', on = 'poi_id')
test2.to_csv('test2.csv', index=False)

In [None]:
poi_cluster[poi_cluster['clusters'] == 16]['poi_id']

In [None]:
feature_cluster = feature.reset_index()
feature_cluster = feature_cluster.merge(poi_cluster, how = "left", on = 'poi_id')
feature_cluster.to_csv('feature_cluster.csv')